# Creating a data set for model training from 2021 season data

In [None]:
#Imports
import statsapi
import datetime 
import pandas as pd
import json
from functools import reduce
import numpy as np

In [None]:
#Getting start date and end date of a season
def season_start_end(year):
    """Look up the regular-season start and end dates for one season.

    Queries the "seasons" endpoint through ``statsapi.get`` with ``sportId`` 1
    and the given ``year``, then reads ``regularSeasonStartDate`` and
    ``regularSeasonEndDate`` from the first entry of the response's
    ``seasons`` list.

    Returns a two-item list: ``[season_start, season_end]``.
    """
    response = statsapi.get("seasons", {"sportId": 1, "season": year})
    first_season = response['seasons'][0]
    return [first_season[key] for key in ('regularSeasonStartDate', 'regularSeasonEndDate')]

In [None]:
#season_start_end returns [start, end]; unpack both dates in one step
start_end_2021 = season_start_end(2021)
season_start, season_end = start_end_2021

In [None]:
#Adding 35 days to the season start, when I want to start collecting data
season_start_offset = (datetime.datetime.strptime(season_start, "%Y-%m-%d") + datetime.timedelta(days=35)).strftime('%Y-%m-%d')

In [None]:
#Getting all season matchup outcomes
def season_match_outcomes(season_start, season_end):
    """Collect the winner and loser of each game scheduled in a date range.

    Iterates over ``statsapi.schedule(start_date=season_start,
    end_date=season_end)`` and, for every game that carries all of the
    ``game_id``, ``winning_team`` and ``losing_team`` keys, records them as
    ``{'game_id': ..., 'win': ..., 'loss': ...}``.

    Games missing any of those keys are skipped (presumably games without a
    decided result - verify against the schedule data).  Any other error,
    e.g. a failed request, is no longer silently swallowed and propagates to
    the caller.

    Returns a list of the outcome dicts in schedule order.
    """
    outcomes = []
    for game in statsapi.schedule(start_date=season_start, end_date=season_end):
        try:
            outcome = {
                'game_id': game['game_id'],
                'win': game['winning_team'],
                'loss': game['losing_team'],
            }
        except KeyError:
            #Entry has no winner/loser recorded - nothing to learn from it
            continue
        outcomes.append(outcome)
    return outcomes

In [None]:
#Setting variable to the output for season match outcomes
all_matches = season_match_outcomes(season_start_offset, season_end)

In [None]:
#Creating a list of all game ids for period in 2021 season
#One id per recorded outcome, in the same order as all_matches
game_ids = [match['game_id'] for match in all_matches]
game_ids

In [None]:
game_info = statsapi.boxscore_data(632520, timecode=None)
game_info

## Creating json of all the 2021 matchups and rosters

In [None]:
#Takes a long time to run (16min) - There was a better way I am sure, a df would have been better, but using this for now
#Commenting out to not run again
#dict_list = []
#for i in game_ids:
#    game_info = statsapi.boxscore_data(i, timecode=None)
#    roster_dict = {}
#    roster_dict['game_id'] = int(i)
#    roster_dict['home_roster'] = game_info['home']['battingOrder'] + game_info['home']['bullpen']
#    roster_dict['away_roster'] = game_info['away']['battingOrder'] + game_info['away']['bullpen']
#    dict_list.append(roster_dict)
#dict_list

In [None]:
#Creating json from above dict_list
#Commenting out to not run again
#with open('roster_dict', 'w') as fout:
#    json.dump(dict_list, fout)

##### Adding date to data in a new json file
###### Remembered a bit later I needed the date to use my stats function 

In [None]:
#Takes a long time to run (16min) - There was a better way I am sure, a df would have been better, but using this for now
#Commenting out to not run again
#dict_list = []
#for i in game_ids:
#    game_info = statsapi.boxscore_data(i, timecode=None)
#    roster_dict = {}
#    roster_dict['game_id'] = int(i)
#    roster_dict['date'] = game_info['gameId'][0:10].replace('/', '-')
#    roster_dict['home_roster'] = game_info['home']['battingOrder'] + game_info['home']['bullpen']
#    roster_dict['away_roster'] = game_info['away']['battingOrder'] + game_info['away']['bullpen']
#    dict_list.append(roster_dict)
#dict_list

In [None]:
#Creating json from above dict_list
#Commenting out to not run again
#with open('roster_date_dict', 'w') as fout:
#    json.dump(dict_list, fout)

## Creating Player Stats function

In [None]:
#Creation of this function further down -  need to use it in the stats function

#Function to sum values in two dictionaries
def add_dicts(d1, d2):
    """Merge two dictionaries, adding together the values of shared keys.

    The result holds every key of ``d1`` and ``d2``.  For a key present in
    both, the value is ``d1[key] + d2[key]`` (so numbers are summed and
    strings are concatenated).  When the two values cannot be added - for
    example two nested dicts - the value from ``d2`` is kept for that key and
    the remaining keys are still summed.  Previously the first such failure
    stopped the summing of every key that came after it.

    Neither input is modified; a new dict is returned.
    """
    merged = dict(d1)
    merged.update(d2)
    for key, left_value in d1.items():
        if key not in d2:
            continue
        try:
            merged[key] = left_value + d2[key]
        except TypeError:
            #Values that do not support "+" keep d2's entry from update()
            pass
    return merged

In [None]:
#Create function to get player stats
#Inputs are the id, start date and end date

#Currently not getting all the fielding stats, as there are more than one set of them - based on the pos they play - fix in progress


def player_stats(id,start_date, end_date, season):
    """First draft of the per-player stats lookup for a date range.

    Makes one ``statsapi.get("people", ...)`` request per stat group
    (hitting, pitching, fielding) and returns a dict with up to three keys:

    * ``'hitting'``  - the ``stat`` dict of the FIRST hitting split
    * ``'pitching'`` - the ``stat`` dict of the LAST pitching split
    * ``'fielding'`` - the ``stat`` dicts of the first split seen for each
      position code, combined with ``add_dicts``

    A group whose data cannot be read from the response is left out of the
    result.
    """
    #Fetching each group on its own - the API switches their order when fetched at the same time
    responses = {}
    for group in ('hitting', 'pitching', 'fielding'):
        responses[group] = statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[{group}],type=[byDateRange],startDate={start_date},endDate={end_date},season={season})"})

    #dict where all the collected stats end up
    stat_dict = {}

    try:
        stat_dict['hitting'] = responses['hitting']['people'][0]['stats'][0]['splits'][0]['stat']
    except: pass

    try:
        stat_dict['pitching'] = responses['pitching']['people'][0]['stats'][0]['splits'][-1]['stat']
    except: pass

    try:
        #Position codes already collected - positions are listed more than once
        seen_codes = []
        #One fielding stat dict per distinct position
        position_lines = []
        for split in responses['fielding']['people'][0]['stats'][0]['splits']:
            code = split['stat']['position']['code']
            if code not in seen_codes:
                seen_codes.append(code)
                position_lines.append(split['stat'])

        all_fielding = {}
        for line in position_lines:
            all_fielding = add_dicts(all_fielding, line)
        stat_dict['fielding'] = all_fielding
    except: pass

    return stat_dict

In [None]:
#Testing function for 2022
player_stats(518934,'2022-08-05','2022-08-12', 2022)

In [None]:
#Testing player stats for a pitcher - Found I needed to use [-1] in my function to get the cumulative pitching stats
player_stats(593974,'2021-04-01', '2021-05-06', 2021)

#### Various tests and examples trying to figure out how to get the above function working

In [None]:
#Checking length of fielding_stats
fielding_stats = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})
len(fielding_stats['people'][0]['stats'][0]['splits'])

In [None]:
#Verifying location of position code for looping
statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][0]['stat']['position']['code']

In [None]:
#Verifying position of one player's single position fielding stats
statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[hitting,fielding,pitching],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][1]['splits'][0]['stat']

In [None]:
#Looking at stats for Javier Baez who switched teams mid season.  Need to use abbreviation == All to get his stats, and combine them.
statsapi.get("people", {"personIds": 595879, "hydrate": "stats(group=[hitting,pitching,fielding],type=[byDateRange],\
    startDate=2021-04-05,endDate=2021-10-01,season=2021)"})

In [None]:
#Testing for Javier Baez - Only returning first entry of hitting 
player_stats(595879, '2021-04-05', '2021-10-01', 2021)

## Trying to add dictionaries together to get total stats

In [None]:
#Testing to see if I can combine fielding stats into a dict
#It works except for the 'position' dictionary, which I do not need the info from
#Also combines fielding as a string, but I can fix that later, if I decide to use that stat (thinking I will not for now)
d1 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][0]['stat']

d2 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][1]['stat']

d3 = dict(d1) # don't do `d3=d1`, you need to make a copy

d3.update(d2) 
try:
    for i, j in d1.items():

        for x, y in d2.items():

            if i == x:

                d3[i]=(j+y)
except: pass
print(d3)

In [None]:
#Testing for three dictionaries
d1 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][0]['stat']

d2 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][1]['stat']

d0 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][2]['stat']

d3 = dict(d1) # don't do `d3=d1`, you need to make a copy

d3.update(d2) 
try:
    for i, j in d1.items():

        for x, y in d2.items():

                if (i == x):

                    d3[i]=(j+y)
except: pass
d4 = dict(d3)
d4.update(d0)
try:
    for i, j in d3.items():

        for x, y in d0.items():

                if (i == x):

                    d4[i]=(j+y)
except: pass
print(d4)

In [None]:
#Making a function to shorten the code - will use this in the stats function to get all the relevant fielding stats added together
def add_dicts(d1, d2):
    """Merge two dictionaries, adding together the values of shared keys.

    The result holds every key of ``d1`` and ``d2``.  For a key present in
    both, the value is ``d1[key] + d2[key]`` (so numbers are summed and
    strings are concatenated).  When the two values cannot be added - for
    example two nested dicts - the value from ``d2`` is kept for that key and
    the remaining keys are still summed.  Previously the first such failure
    stopped the summing of every key that came after it.

    Neither input is modified; a new dict is returned.
    """
    merged = dict(d1)
    merged.update(d2)
    for key, left_value in d1.items():
        if key not in d2:
            continue
        try:
            merged[key] = left_value + d2[key]
        except TypeError:
            #Values that do not support "+" keep d2's entry from update()
            pass
    return merged

In [None]:
#Testing add_dicts starting with an empty dictionary
#Folding each fielding dict into the running total, starting from nothing
empty_dict = {}
dict_list = [d1,d0,d2]
for fielding_line in dict_list:
    empty_dict = add_dicts(empty_dict, fielding_line)
empty_dict

In [None]:
#Getting the if statement to work inside the for loop of the player_stats function and 
#creating a list of unique fielding stats for a player (they duplicated for some reason)
#temp_list holds the position codes already seen, temp_list2 the matching stat dicts
temp_list = []
temp_list2 = []
fielding_stats = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})
for split in fielding_stats['people'][0]['stats'][0]['splits']:
    position_code = split['stat']['position']['code']
    #Only the first split seen for each position code is kept
    if position_code not in temp_list:
        temp_list.append(position_code)
        temp_list2.append(split['stat'])
temp_list2

In [None]:
#Getting add_dicts function to work to combine all fielding stats into one dictionary - strings will be messed up
#Running total of every unique fielding dict collected in temp_list2
dict_A = {}
for fielding_line in temp_list2:
    dict_A = add_dicts(dict_A, fielding_line)
dict_A

## Pulling some stats for a team roster/matchup
###### Need to get the date of the game for the player_stats function to work properly - this date will be the end date, with the start of season offset date as the start

In [None]:
#reading json
#Using a with-block so the file handle is closed once the json is loaded
with open('roster_dict') as f:
    data = json.load(f)

In [None]:
#Looking at the first matchup 

#Getting relevant data from dictionary
matchup_one = data[0]
game_id = matchup_one['game_id']
home_roster = matchup_one['home_roster']
away_roster = matchup_one['away_roster']
date = statsapi.boxscore_data(game_id, timecode=None)['gameId'][0:10].replace('/', '-')

#Pulling stats for each player - had to look up date manually for now

for i in home_roster:
    print(player_stats(i,'2021-04-01', date, 2021))

### Troubleshooting again for player_stats, with pitching specifically
###### Was running into issues with pitchers being on the roster but not pitching for the team yet in the season

In [None]:
#Looking at how to get the total stats for a pitcher in the season so far (As they are often traded and a separate entry is created for each team)
pitching_stats = statsapi.get("people", {"personIds": 593974, "hydrate": f"stats(group=[pitching],type=[byDateRange],\
    startDate=2021-04-01,endDate=2021-05-06,season=2021)"})
pitching_stats['people'][0]['stats'][0]['splits'][-1]['stat']

In [None]:
#Looking for Gerrit Cole's stats with function - was returning {} before I fixed the function now
player_stats(543037,'2021-04-01', '2021-05-06', 2021)

In [None]:
#Looking for Gerrit Cole's stats straight from the API - works fine
statsapi.get("people", {"personIds": 543037, "hydrate": f"stats(group=[pitching],type=[byDateRange],\
    startDate=2021-04-01,endDate=2021-05-06,season=2021)"})['people'][0]['stats'][0]['splits'][-1]['stat']

In [None]:
#Testing for pitcher who swapped teams mid season
#Verifies that we only need the last entry for the cumulative pitching stats
statsapi.get("people", {"personIds": 458677, "hydrate": f"stats(group=[pitching],type=[byDateRange],\
    startDate=2021-04-01,endDate=2021-10-01,season=2021)"})

## Player Stats Function Final Vers

In [None]:
#Fixing function to get player stats
#Inputs are the id, start date and end date

#Fix using abbreviation == All

def player_stats(id,start_date, end_date, season):
    """Fetch a player's hitting, pitching and fielding stats for a date range.

    Parameters
    ----------
    id : player id passed to the API as ``personIds``
    start_date, end_date : date strings placed into the ``byDateRange`` query
    season : season placed into the query

    Returns
    -------
    dict with up to three keys:

    * ``'hitting'``  - ``stat`` dict of the last hitting split
    * ``'pitching'`` - ``stat`` dict of the last pitching split
    * ``'fielding'`` - the ``stat`` dicts of every fielding split whose
      ``sport.abbreviation`` is ``'All'``, combined with ``add_dicts``
      (an empty dict when the response has splits but none is marked 'All')

    A group is left out when the response does not contain it (missing key,
    empty list or a ``None`` where a container was expected).  Errors raised
    by ``statsapi.get`` itself are not caught.
    """
    def fetch(group):
        #One request per group - the API switches their order when fetched at the same time
        return statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[{group}],type=[byDateRange],startDate={start_date},endDate={end_date},season={season})"})

    hitting_stats = fetch('hitting')
    pitching_stats = fetch('pitching')
    fielding_stats = fetch('fielding')

    #Errors that mean "this part of the response is absent", not "something broke"
    missing_data = (KeyError, IndexError, TypeError)

    #dict where all the collected stats end up
    stat_dict = {}

    #Taking last entry of hitting stats - the notebook's checks found it to be the cumulative line for the date range
    try:
        stat_dict['hitting'] = hitting_stats['people'][0]['stats'][0]['splits'][-1]['stat']
    except missing_data:
        pass

    #Taking last entry of pitching stats, for the same reason
    try:
        stat_dict['pitching'] = pitching_stats['people'][0]['stats'][0]['splits'][-1]['stat']
    except missing_data:
        pass

    #Summing the 'All' fielding lines, one per position played (team changes accounted for)
    try:
        fielding_list = [
            split['stat']
            for split in fielding_stats['people'][0]['stats'][0]['splits']
            if split['sport']['abbreviation'] == 'All'
        ]
        all_fielding = {}
        for position_line in fielding_list:
            all_fielding = add_dicts(all_fielding, position_line)
        stat_dict['fielding'] = all_fielding
    except missing_data:
        pass

    return stat_dict

In [None]:
#Testing for Javier Baez - Now works properly for hitting and fielding
player_stats(595879, '2021-04-20', '2021-10-01', 2021)

## Pulling Stats from roster_date_dict for a team matchup

In [None]:
#reading json
#Using a with-block so the file handle is closed once the json is loaded
with open('roster_date_dict') as f:
    roster_matchup = json.load(f)
roster_matchup[0]

In [None]:
#Looking at the first matchup 

#Getting relevant data from dictionary
first_game = roster_matchup[0]
date = first_game['date']
game_id = first_game['game_id']
#Rosters are read from the same record as the date and game_id
#(previously taken from matchup_one, a variable left over from an earlier cell)
home_roster = first_game['home_roster']
away_roster = first_game['away_roster']

#Pulling stats for each player - using season_start_end function to fetch the season start date
#The start date is looked up once, instead of once per player inside the loop
season_first_day = season_start_end(2021)[0]

for i in home_roster:
    print(player_stats(i, season_first_day, date, 2021))

In [None]:
#Comparing some results of the function to a proper get request from the API
#Results match my function
roster_list = [518934,
   519317,
   592450,
   642180,
   650402,
   645801,
   543305,
   543309,
   458731,
   656061,
   570666,
   547973,
   593334,
   650633,
   446372,
   642528,
   656756,
   593974,
   592791]
for i in roster_list:
    print(statsapi.get("people", {"personIds": i, "hydrate": f"stats(group=[pitching,fielding,hitting],type=[byDateRange],\
    startDate=2021-04-01,endDate=2021-05-06,season=2021)"})['people'][0]['stats'])

## Pulling more specific stats from roster_date_dict for a player
###### Starting with only a few stats

In [None]:
#Pulling stats for one player and adding to a df for future use

#Stats we will be using for now
hitting_stats = ['runs', 'rbi', 'homeRuns', 'hits', 'avg', 'ops', 'groundIntoDoublePlay']

#Initializing empty df with specified column names
df = pd.DataFrame(columns = hitting_stats)

#Fetching the player's stats once (previously player_stats was called again for every stat)
player_hitting = player_stats(595879, '2021-04-05', '2021-10-01', 2021)['hitting']

#Storing all stats as a list, in column order
stat_list = [player_hitting[i] for i in hitting_stats]

#appending stats to the df
df.loc[len(df)] = stat_list


In [None]:
#Testing looping through one roster - Need to be more efficient here, takes 20 seconds to execute

#Stats we will be using for now
hitting_stats = ['runs', 'rbi', 'homeRuns', 'hits', 'avg', 'ops', 'groundIntoDoublePlay']

#Initializing empty df with specified column names
df = pd.DataFrame(columns = hitting_stats)

#NOTE: player_list is not used by the loop below, which walks roster_list from the cell above
player_list = [595879,518934]

for j in roster_list:

    #Fetching each player's stats once (previously player_stats was called again for every stat)
    player_hitting = player_stats(j, '2021-04-05', '2021-10-01', 2021)['hitting']

    #Storing all stats as a list, in column order
    stat_list = [player_hitting[i] for i in hitting_stats]

    #appending stats to the df
    df.loc[len(df)] = stat_list

In [None]:
#Testing looping through a few players - Need to be more efficient here, takes about the same time as above to execute

#Stats we will be using for now
hitting_stats = ['runs', 'rbi', 'homeRuns', 'hits', 'avg', 'ops', 'groundIntoDoublePlay']

#Initializing empty df with specified column names
df = pd.DataFrame(columns = hitting_stats)


#Storing all stats as a list of rows - using roster_list from above
#player_stats is called once per player (previously once per player per stat)
stat_list = []
for j in roster_list:
    player_hitting = player_stats(j, '2021-04-05', '2021-10-01', 2021)['hitting']
    stat_list.append([player_hitting[i] for i in hitting_stats])

#appending stats to the df
for row in stat_list:
    df.loc[len(df)] = row

## Testing putting data directly into a df - Test this for a team roster later

In [None]:
#Creating empty df from keys in the dicts of each data group - using players to fill

df = pd.DataFrame(columns = player_stats(518934, '2021-04-05', '2021-10-01', 2021)['hitting'].keys())
df2 = pd.DataFrame(columns = player_stats(518934, '2021-04-05', '2021-10-01', 2021)['fielding'].keys())
df3 = pd.DataFrame(columns = player_stats(656061, '2021-04-05', '2021-10-01', 2021)['pitching'].keys())

In [None]:
df.loc[len(df)] = player_stats(518934, '2021-04-05', '2021-10-01', 2021)['hitting']

In [None]:
df2.loc[len(df2)] = player_stats(518934, '2021-04-05', '2021-10-01', 2021)['fielding']

In [None]:
#Combining all the dfs
df = pd.concat([df,df2,df3],axis=1)

In [None]:
#Adding game_id to df
df['game_id'] = first_game['game_id']

In [None]:
df['game_id']

## Creating new json for use in creating organized team data

In [None]:
#Takes a long time to run (9min) - There was a better way I am sure, a df would have been better, but using this for now
#Commenting out to not run again
#dict_list = []
#for i in game_ids:
#    game_info = statsapi.boxscore_data(i, timecode=None)
#    roster_dict = {}
#    #General game info
#    roster_dict['game_id'] = int(i)
#    roster_dict['date'] = game_info['gameId'][0:10].replace('/', '-')
#    #Home team stats - hitting, starting pitcher, bullpen
#    roster_dict['home_id'] = game_info['teamInfo']['home']['id']
#    roster_dict['home_hitting'] = game_info['home']['batters'] + game_info['home']['bench'] + game_info['home']['bullpen']
#    roster_dict['home_sp'] = game_info['homePitchers'][1]['personId']
#    roster_dict['home_bullpen'] = game_info['home']['bullpen']
#    #Away team stats - hitting, starting pitcher, bullpen
#    roster_dict['away_id'] = game_info['teamInfo']['away']['id']
#    roster_dict['away_hitting'] = game_info['away']['batters'] + game_info['away']['bench'] + game_info['away']['bullpen']
#    roster_dict['away_sp'] = game_info['awayPitchers'][1]['personId']
#    roster_dict['away_bullpen'] = game_info['away']['bullpen']
#    dict_list.append(roster_dict)
#dict_list

In [None]:
#Creating json from above dict_list
#Commenting out to not run again
#with open('roster_by_group', 'w') as fout:
#    json.dump(dict_list, fout)

## Testing creation of team stats for one team

In [None]:
#reading json
#Using a with-block so the file handle is closed once the json is loaded
with open('roster_by_group') as f:
    roster_groups = json.load(f)

In [None]:
#Testing looping through one roster - Need to be more efficient here, takes 20 seconds to execute

#Stats we will be using for now
hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch', 'sacFlies', 'doubles', 'triples', 'homeRuns', 'plateAppearances']

#Initializing empty df with specified column names
df = pd.DataFrame(columns = hitting_stats)


for j in roster_groups[0]['home_hitting']:
    try:
        #Fetching each player's stats once (previously player_stats was called again for every stat)
        player_hitting = player_stats(j, '2021-04-05', '2021-10-01', 2021)['hitting']

        #Storing all stats as a list - using roster_groups[0]['home_hitting']
        stat_list = [player_hitting[i] for i in hitting_stats]
    except KeyError:
        #No hitting entry (or a missing stat) for this player - skip them
        continue

    #appending stats to the df
    df.loc[len(df)] = stat_list

#### Creating relevant stats from the data

In [None]:
#team batting avg
team_avg = round(df['hits'].sum()/df['atBats'].sum(),3)
team_avg

In [None]:
#Team on base percentage (obp)
team_obp = round((df['hits'].sum()+df['baseOnBalls'].sum()+df['hitByPitch'].sum())/df['plateAppearances'].sum(),3)
team_obp

In [None]:
#Team slugging percentage
team_slug = round((df['hits'].sum() + df['doubles'].sum() + df['triples'].sum()*2 + df['homeRuns'].sum()*3)/df['atBats'].sum(),3)
team_slug

##  Calculating difference in hitting stats for a team matchup

In [None]:
#Creating df of relevant home team hitting stats

#Stats we will be using for now
hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']
team_hit_stats = ['game_id','avg', 'obp', 'slug']

#Initializing empty df with specified column names
df = pd.DataFrame(columns = team_hit_stats)
hits = 0
baseOnBalls = 0
atBats = 0
hitByPitch = 0
doubles = 0
triples = 0
homeRuns= 0
plateAppearances = 0

for j in roster_groups[0]['home_hitting']:
    try:
        #Fetching each player's stats once (previously player_stats was called again for every stat)
        player_hitting = player_stats(j, '2021-04-05', '2021-10-01', 2021)['hitting']

        #Storing all stats as a list - using roster_groups[0]['home_hitting']
        stat_list = [player_hitting[i] for i in hitting_stats]
    except KeyError:
        #No hitting entry (or a missing stat) for this player - leave them out of the totals
        continue
    hits+= stat_list[0]
    baseOnBalls+= stat_list[1]
    atBats+= stat_list[2]
    hitByPitch+= stat_list[3]
    doubles+= stat_list[4]
    triples+= stat_list[5]
    homeRuns+= stat_list[6]
    plateAppearances+= stat_list[7]

#appending the team row to the df: game_id, avg, obp, slug
df.loc[len(df)] = [int(roster_groups[0]['game_id']),round(hits/atBats,3),round((hits+baseOnBalls+hitByPitch)/plateAppearances,3),round((hits+doubles+triples*2+homeRuns*3)/atBats,3)]

In [None]:
#Creating a df of relevant away team hitting stats

#Stats we will be using for now
hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']
team_hit_stats = ['game_id','avg', 'obp', 'slug']

#Initializing empty df with specified column names
df_away = pd.DataFrame(columns = team_hit_stats)
hits = 0
baseOnBalls = 0
atBats = 0
hitByPitch = 0
doubles = 0
triples = 0
homeRuns= 0
plateAppearances = 0

for j in roster_groups[0]['away_hitting']:
    try:
        #Fetching each player's stats once (previously player_stats was called again for every stat)
        player_hitting = player_stats(j, '2021-04-05', '2021-10-01', 2021)['hitting']

        #Storing all stats as a list - using roster_groups[0]['away_hitting']
        stat_list = [player_hitting[i] for i in hitting_stats]
    except KeyError:
        #No hitting entry (or a missing stat) for this player - leave them out of the totals
        continue
    hits+= stat_list[0]
    baseOnBalls+= stat_list[1]
    atBats+= stat_list[2]
    hitByPitch+= stat_list[3]
    doubles+= stat_list[4]
    triples+= stat_list[5]
    homeRuns+= stat_list[6]
    plateAppearances+= stat_list[7]

#appending the team row to df_away - row label comes from df_away itself (was len(df), the home frame)
df_away.loc[len(df_away)] = [int(roster_groups[0]['game_id']),round(hits/atBats,3),round((hits+baseOnBalls+hitByPitch)/plateAppearances,3),round((hits+doubles+triples*2+homeRuns*3)/atBats,3)]

In [None]:
#Finding difference of home and away team stats
df.set_index('game_id').subtract(df_away.set_index('game_id'), fill_value =0).reset_index()

## Creating a function to calculate the difference in hitting stats for a team matchup

In [None]:
#Creating team hitting stats function

#Function takes the game_id and whether you want home or away hitting stats as inputs
#returns a df with three columns of the calculated stats

def team_hitting_stats(game_id, start_date, end_date, year, type='home_hitting'):
    """Compute team batting average, on-base and slugging figures for one game.

    Looks up ``game_id`` in the module-level ``roster_groups`` list, sums the
    counting stats of every player id stored under the ``type`` key
    ('home_hitting' or 'away_hitting') over ``start_date``..``end_date`` and
    derives:

    * avg  = hits / atBats
    * obp  = (hits + baseOnBalls + hitByPitch) / plateAppearances
    * slug = (hits + doubles + 2*triples + 3*homeRuns) / atBats

    Players for whom ``player_stats`` returns no 'hitting' entry, or whose
    entry lacks one of the needed stats, are left out of the totals.
    ``player_stats`` is called once per player; earlier it was called once
    per player per stat.

    Returns a DataFrame with columns game_id, avg, obp, slug holding one row
    per matching roster entry (empty when ``game_id`` is not found).  Raises
    ZeroDivisionError when a matching roster yields zero at-bats or plate
    appearances.
    """
    hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']
    team_hit_stats = ['game_id','avg', 'obp', 'slug']

    #Initializing empty df with specified column names
    df = pd.DataFrame(columns = team_hit_stats)

    #Running team totals, one counter per counting stat
    totals = dict.fromkeys(hitting_stats, 0)

    for game in roster_groups:
        if game['game_id'] != game_id:
            continue

        for player_id in game[type]:
            try:
                player_hitting = player_stats(player_id, start_date, end_date, year)['hitting']
                #Read every stat before adding any, so a player is counted fully or not at all
                player_values = [player_hitting[stat] for stat in hitting_stats]
            except KeyError:
                continue
            for stat, value in zip(hitting_stats, player_values):
                totals[stat] += value

        #appending stats to the df
        df.loc[len(df)] = [
            game_id,
            round(totals['hits']/totals['atBats'],3),
            round((totals['hits']+totals['baseOnBalls']+totals['hitByPitch'])/totals['plateAppearances'],3),
            round((totals['hits']+totals['doubles']+totals['triples']*2+totals['homeRuns']*3)/totals['atBats'],3),
        ]
    return df

In [None]:
#Testing hitting stats
team_hitting_stats(634224,'2021-04-01', '2021-05-06', 2021, type = 'home_hitting')

In [None]:
#Creating team hitting stats function - without need of a date input

#Function takes the game_id and whether you want home or away hitting stats as inputs
#returns a df with three columns of the calculated stats

def team_hitting_stats(game_id,year, type='home_hitting'):
    """Compute team batting average, on-base and slugging figures for one game.

    Looks up ``game_id`` in the module-level ``roster_groups`` list and sums
    the counting stats of every player id stored under the ``type`` key
    ('home_hitting' or 'away_hitting').  The stats window runs from the
    regular-season start of ``year`` (via ``season_start_end``; previously
    hard-coded to 2021) to the ``date`` stored with the game.  Derived values:

    * avg  = hits / atBats
    * obp  = (hits + baseOnBalls + hitByPitch) / plateAppearances
    * slug = (hits + doubles + 2*triples + 3*homeRuns) / atBats

    Players for whom ``player_stats`` returns no 'hitting' entry, or whose
    entry lacks one of the needed stats, are left out of the totals.
    ``player_stats`` is called once per player; earlier it was called once
    per player per stat.

    Returns a DataFrame with columns game_id, avg, obp, slug holding one row
    per matching roster entry (empty when ``game_id`` is not found).  Raises
    ZeroDivisionError when a matching roster yields zero at-bats or plate
    appearances.
    """
    hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']
    team_hit_stats = ['game_id','avg', 'obp', 'slug']
    start_date = season_start_end(year)[0]

    #Initializing empty df with specified column names
    df = pd.DataFrame(columns = team_hit_stats)

    #Running team totals, one counter per counting stat
    totals = dict.fromkeys(hitting_stats, 0)

    for game in roster_groups:
        if game['game_id'] != game_id:
            continue

        #Stats are collected up to the date of the game
        end_date = game['date']

        for player_id in game[type]:
            try:
                player_hitting = player_stats(player_id, start_date, end_date, year)['hitting']
                #Read every stat before adding any, so a player is counted fully or not at all
                player_values = [player_hitting[stat] for stat in hitting_stats]
            except KeyError:
                continue
            for stat, value in zip(hitting_stats, player_values):
                totals[stat] += value

        #appending stats to the df
        df.loc[len(df)] = [
            game_id,
            round(totals['hits']/totals['atBats'],3),
            round((totals['hits']+totals['baseOnBalls']+totals['hitByPitch'])/totals['plateAppearances'],3),
            round((totals['hits']+totals['doubles']+totals['triples']*2+totals['homeRuns']*3)/totals['atBats'],3),
        ]
    return df

In [None]:
#Testing hitting stats without the date inputs (home team)
team_hitting_stats(634224, 2021, type = 'home_hitting')

In [None]:
#Function that implements the team_hitting_stats function for both teams

#Takes the game_id as input
#Returns a df that is the results of home - away team stats.

def hitting_stats_diff(game_id, start_date, end_date, year):
    """Return home-team minus away-team hitting stats for one game.

    Calls ``team_hitting_stats(game_id, year, type=...)`` once for
    'home_hitting' and once for 'away_hitting', indexes both results by
    game_id and subtracts the away frame from the home frame
    (``fill_value`` 0).  ``start_date`` and ``end_date`` are accepted but not
    used by this version.

    Returns the difference as a DataFrame with game_id restored as a column.
    """
    home_side = team_hitting_stats(game_id, year, type='home_hitting')
    away_side = team_hitting_stats(game_id, year, type='away_hitting')

    home_by_game = home_side.set_index('game_id')
    away_by_game = away_side.set_index('game_id')

    difference = home_by_game.subtract(away_by_game, fill_value=0)
    return difference.reset_index()
#Input is the game_id to look at
#returns a list of values for entry to a df
#Commented out for now as I build it

In [None]:
#Testing hitting stats diff function (takes 1 min)
hitting_stats_diff(634224,'2021-04-01', '2021-05-06', 2021)

## Creating a hopefully more efficient hitting_stats_diff function

In [None]:
#Function that calculates hitting stats for home team - away team
#Takes game_id and year as the input - maybe can remove the year input

def hitting_stats_diff(game_id,year):
    """Return home-team minus away-team hitting stats (avg, obp, slug) for one game.

    Finds ``game_id`` in the module-level ``roster_groups`` list and, for the
    player ids under 'home_hitting' and 'away_hitting', sums the counting
    stats from the regular-season start of ``year`` (via ``season_start_end``;
    previously hard-coded to 2021) up to the game's stored ``date``.  Each
    team's totals become:

    * avg  = hits / atBats
    * obp  = (hits + baseOnBalls + hitByPitch) / plateAppearances
    * slug = (hits + doubles + 2*triples + 3*homeRuns) / atBats

    Players for whom ``player_stats`` returns no 'hitting' entry, or whose
    entry lacks one of the needed stats, are left out.  ``player_stats`` is
    called once per player; earlier it was called once per player per stat.

    Returns a one-row DataFrame with columns game_id, avg, obp, slug holding
    home minus away.  Raises ZeroDivisionError when a team ends up with zero
    at-bats or plate appearances, which includes the case where ``game_id``
    is not present in ``roster_groups``.
    """
    hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']
    team_hit_stats = ['game_id','avg', 'obp', 'slug']
    start_date = season_start_end(year)[0]

    def add_roster_totals(totals, roster, end_date):
        #Adds each player's counting stats for start_date..end_date into totals
        for player_id in roster:
            try:
                player_hitting = player_stats(player_id, start_date, end_date, year)['hitting']
                #Read every stat before adding any, so a player is counted fully or not at all
                player_values = [player_hitting[stat] for stat in hitting_stats]
            except KeyError:
                continue
            for stat, value in zip(hitting_stats, player_values):
                totals[stat] += value

    def rate_row(totals):
        #Turns summed counting stats into the [game_id, avg, obp, slug] row
        return [
            game_id,
            round(totals['hits']/totals['atBats'],3),
            round((totals['hits']+totals['baseOnBalls']+totals['hitByPitch'])/totals['plateAppearances'],3),
            round((totals['hits']+totals['doubles']+totals['triples']*2+totals['homeRuns']*3)/totals['atBats'],3),
        ]

    #Running totals for each side, one counter per counting stat
    home_totals = dict.fromkeys(hitting_stats, 0)
    away_totals = dict.fromkeys(hitting_stats, 0)

    #Checking through roster_groups list for the game_id
    for game in roster_groups:
        if game['game_id'] == game_id:

            #Setting end_date for stats function (Date of the game)
            end_date = game['date']

            add_roster_totals(home_totals, game['home_hitting'], end_date)
            add_roster_totals(away_totals, game['away_hitting'], end_date)

    #appending stats to their respective dfs
    df_h = pd.DataFrame(columns = team_hit_stats)
    df_h.loc[len(df_h)] = rate_row(home_totals)
    df_a = pd.DataFrame(columns = team_hit_stats)
    df_a.loc[len(df_a)] = rate_row(away_totals)

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Testing function - takes 40s
hitting_stats_diff(634224,2021)

In [None]:
#Function that calculates hitting stats for home team - away team
#Takes game_id and year as the input - maybe can remove the year input

#Attempting to use a break statement to speed up - Maybe a few seconds faster - will test with timeit
#Took away year as an input - takes about 15-20s longer to run this way..

def hitting_stats_diff(game_id):
    """Return home team minus away team hitting stats for one game.

    The game is looked up in the global roster_groups list (first entry with
    a matching game_id). Every hitter listed for each side is passed to
    player_stats for the period from the season start date (taken from
    season_start_end) to the date of the game, and the team totals are turned
    into avg, obp and slug.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']

    Returns
    -------
    DataFrame with one row and the columns game_id, avg, obp, slug, where each
    stat is the home value minus the away value.

    Raises
    ------
    ValueError : game_id is not present in roster_groups
    ZeroDivisionError : a team ends up with 0 at bats or 0 plate appearances
    """
    #List of stats being pulled from API
    hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']

    #List of stats being calculated
    team_hit_stats = ['game_id','avg', 'obp', 'slug']

    #First roster_groups entry with this game_id
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))

    #Setting end_date for stats function (Date of the game)
    #NOTE(review): if player_stats treats end_date as inclusive, the game being described is part of its own stats - confirm
    end_date = game['date']

    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]

    #Start date of the season the game belongs to
    start_date = season_start_end(year)[0]

    def team_totals(players):
        #Adding the stats of all listed hitters together for the time period
        totals = dict.fromkeys(hitting_stats, 0)
        for player in players:
            try:
                #One player_stats call per player, every stat is read from the same result
                hitting = player_stats(player, start_date, end_date, year)['hitting']
                stat_list = [hitting[stat] for stat in hitting_stats]
                for stat, value in zip(hitting_stats, stat_list):
                    totals[stat]+= value
            except Exception:
                #Best effort: players whose stats cannot be read are left out of the totals
                continue
        return totals

    def slash_line(totals):
        #avg = H/AB, obp = (H+BB+HBP)/PA, slug = total bases/AB
        return [game_id,
                round(totals['hits']/totals['atBats'],3),
                round((totals['hits']+totals['baseOnBalls']+totals['hitByPitch'])/totals['plateAppearances'],3),
                round((totals['hits']+totals['doubles']+totals['triples']*2+totals['homeRuns']*3)/totals['atBats'],3)]

    #Getting stats for each player on the home team, then the away team
    home_totals = team_totals(game['home_hitting'])
    away_totals = team_totals(game['away_hitting'])

    #appending stats to their respective dfs
    df_h = pd.DataFrame(columns = team_hit_stats)
    df_h.loc[len(df_h)] = slash_line(home_totals)
    df_a = pd.DataFrame(columns = team_hit_stats)
    df_a.loc[len(df_a)] = slash_line(away_totals)

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Function that calculates hitting stats for home team - away team
#Takes game_id and year as the input - maybe can remove the year input

#Attempting to use a break statement to speed up - Maybe a few seconds faster - will test with timeit

def hitting_stats_diff(game_id,year):
    """Return home team minus away team hitting stats for one game.

    The game is looked up in the global roster_groups list (first entry with
    a matching game_id). Every hitter listed for each side is passed to
    player_stats for the period from the start date of the given season to
    the date of the game, and the team totals are turned into avg, obp and
    slug.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']
    year : season passed to season_start_end and player_stats

    Returns
    -------
    DataFrame with one row and the columns game_id, avg, obp, slug, where each
    stat is the home value minus the away value.

    Raises
    ------
    ValueError : game_id is not present in roster_groups
    ZeroDivisionError : a team ends up with 0 at bats or 0 plate appearances
    """
    #List of stats being pulled from API
    hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']

    #List of stats being calculated
    team_hit_stats = ['game_id','avg', 'obp', 'slug']

    #Start date of the season
    start_date = season_start_end(year)[0]

    #First roster_groups entry with this game_id
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))

    #Setting end_date for stats function (Date of the game)
    #NOTE(review): if player_stats treats end_date as inclusive, the game being described is part of its own stats - confirm
    end_date = game['date']

    def team_totals(players):
        #Adding the stats of all listed hitters together for the time period
        totals = dict.fromkeys(hitting_stats, 0)
        for player in players:
            try:
                #One player_stats call per player, every stat is read from the same result
                hitting = player_stats(player, start_date, end_date, year)['hitting']
                stat_list = [hitting[stat] for stat in hitting_stats]
                for stat, value in zip(hitting_stats, stat_list):
                    totals[stat]+= value
            except Exception:
                #Best effort: players whose stats cannot be read are left out of the totals
                continue
        return totals

    def slash_line(totals):
        #avg = H/AB, obp = (H+BB+HBP)/PA, slug = total bases/AB
        return [game_id,
                round(totals['hits']/totals['atBats'],3),
                round((totals['hits']+totals['baseOnBalls']+totals['hitByPitch'])/totals['plateAppearances'],3),
                round((totals['hits']+totals['doubles']+totals['triples']*2+totals['homeRuns']*3)/totals['atBats'],3)]

    #Getting stats for each player on the home team, then the away team
    home_totals = team_totals(game['home_hitting'])
    away_totals = team_totals(game['away_hitting'])

    #appending stats to their respective dfs
    df_h = pd.DataFrame(columns = team_hit_stats)
    df_h.loc[len(df_h)] = slash_line(home_totals)
    df_a = pd.DataFrame(columns = team_hit_stats)
    df_a.loc[len(df_a)] = slash_line(away_totals)

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()


In [None]:
#Testing function - takes 40s - maybe a few seconds faster this way
hitting_stats_diff(633899,2021)

## Looking into calculating a rolling 10 day average


In [None]:
#Previous 10 games for both teams of one game (first step towards the 10 day obp)

#For loop to grab relevant data points for a specific game_id in all 2021 games (roster_groups)
count_home = 0
count_away = 0
home_prev_10 = []
away_prev_10 = []
#fixed_pos stays None when the game is missing, so values left over from an earlier run are never reused
fixed_pos = None
for i in range(len(roster_groups)):
    if roster_groups[i]['game_id'] == 633899:

        #Initializing variables to use in the next loops
        date = roster_groups[i]['date']
        home_id = roster_groups[i]['home_id']
        away_id = roster_groups[i]['away_id']
        #fixed_pos is the position of the game in roster_groups, only entries before it are searched
        fixed_pos = i

if fixed_pos is None:
    raise ValueError("game_id 633899 not found in roster_groups")

#Walks backwards from the entry just before the game and pulls up to 10 previous game_ids for home team
#Stopping at index 0 keeps the index from going negative and wrapping around to the end of the list
home_start = fixed_pos - 1
while count_home < 10 and home_start >= 0:
    if roster_groups[home_start]['date']<=date and \
        (roster_groups[home_start]['home_id']==home_id or roster_groups[home_start]['away_id']==home_id):

        home_prev_10.append(roster_groups[home_start]['game_id'])
        print(roster_groups[home_start]['game_id'])
        print(roster_groups[home_start]['date'])
        print(roster_groups[home_start]['home_id'])
        print(roster_groups[home_start]['away_id'])
        print(home_start)
        count_home+=1
    home_start-=1
print(home_prev_10)

#Same backwards walk for the away team
away_start = fixed_pos - 1
while count_away < 10 and away_start >= 0:
    if roster_groups[away_start]['date']<=date and \
        (roster_groups[away_start]['home_id']==away_id or roster_groups[away_start]['away_id']==away_id):

        away_prev_10.append(roster_groups[away_start]['game_id'])
        print(roster_groups[away_start]['game_id'])
        print(roster_groups[away_start]['date'])
        print(roster_groups[away_start]['home_id'])
        print(roster_groups[away_start]['away_id'])
        print(away_start)
        count_away+=1
    away_start-=1
print(away_prev_10)

#### Creating function - to be run once for each team (home/away)

###### Currently returns a list of the previous 10 games for the inputted team id type (home_id/away_id)

In [None]:
#Previous 10 games for one team (first step towards the 10 day obp)

def previous_10_games(game_id, type_id = 'home_id'):
    """Return up to 10 game_ids one team played before the given game.

    The game is looked up in the global roster_groups list; if the game_id
    appears more than once, the last entry is used. The entries before it are
    then walked backwards, keeping those whose date is not after the game's
    date and whose home_id or away_id equals the selected team.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']
    type_id : key of the roster entry holding the team to follow
              ('home_id' or 'away_id')

    Returns
    -------
    List of game_ids, most recent first, with at most 10 items.

    Raises
    ------
    ValueError : game_id is not present in roster_groups
    """
    #Position of the game in roster_groups
    positions = [pos for pos, entry in enumerate(roster_groups) if entry['game_id'] == game_id]
    if not positions:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))

    #fixed_pos is the boundary, only entries before it are searched
    fixed_pos = positions[-1]
    date = roster_groups[fixed_pos]['date']
    team_id = roster_groups[fixed_pos][type_id]

    #Walking backwards from the entry just before the game
    #The range stops at index 0, so the index never goes negative and wraps around to the end of the list
    prev_10 = []
    for pos in range(fixed_pos - 1, -1, -1):
        if len(prev_10) == 10:
            break
        earlier = roster_groups[pos]
        if earlier['date'] <= date and (earlier['home_id'] == team_id or earlier['away_id'] == team_id):
            prev_10.append(earlier['game_id'])
    return prev_10

In [None]:
#Not using this function - found more efficient method

#Building up the 10 day obp function


def obp_rolling_10(game_id, type_id = 'home_id'):
    """Return the obp of one team summed over its previous 10 games (superseded version).

    The game is looked up in the global roster_groups list (last matching
    entry). Up to 10 earlier games of the selected team are collected, and for
    each of them every listed hitter of that team is passed to player_stats
    for the single date of that game. The debug prints of the exploration are
    kept.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']
    type_id : key of the roster entry holding the team to follow
              ('home_id' or 'away_id')

    Returns
    -------
    DataFrame with one row and the columns game_id, obp_10.

    Raises
    ------
    ValueError : game_id is not present in roster_groups
    ZeroDivisionError : no plate appearances could be summed
    """
    #Position of the game in roster_groups
    positions = [pos for pos, entry in enumerate(roster_groups) if entry['game_id'] == game_id]
    if not positions:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))

    #fixed_pos is the boundary, only entries before it are searched
    fixed_pos = positions[-1]
    date = roster_groups[fixed_pos]['date']
    team_id = roster_groups[fixed_pos][type_id]
    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = date[0:4]

    #Walking backwards from the entry just before the game
    #The range stops at index 0, so the index never goes negative and wraps around to the end of the list
    prev_10 = []
    for pos in range(fixed_pos - 1, -1, -1):
        if len(prev_10) == 10:
            break
        earlier = roster_groups[pos]
        if earlier['date'] <= date and (earlier['home_id'] == team_id or earlier['away_id'] == team_id):
            prev_10.append(earlier['game_id'])

    #Creating df with obp_10 as the only stat column
    df = pd.DataFrame(columns = ['game_id','obp_10'])

    #Initializing needed stats to create obp
    hits = 0
    baseOnBalls = 0
    hitByPitch = 0
    plateAppearances = 0
    ops_stats_list = ['hits', 'baseOnBalls','hitByPitch', 'plateAppearances']
    for entry in roster_groups:
        if entry['game_id'] not in prev_10:
            continue
        if entry['home_id'] == team_id:
            #The team was the home side in this earlier game
            game_date = entry['date']
            print(entry['game_id'])
            print(game_date)
            print(hits)
            for player in entry['home_hitting']:
                try:
                    #One player_stats call per player, every stat is read from the same result
                    hitting = player_stats(player, game_date, game_date, year)['hitting']
                    stat_list = [hitting[stat] for stat in ops_stats_list]
                    hits+= stat_list[0]
                    baseOnBalls+= stat_list[1]
                    hitByPitch+= stat_list[2]
                    plateAppearances+= stat_list[3]
                except Exception:
                    #Best effort: players whose stats cannot be read are left out
                    pass
        elif entry['away_id'] == team_id:
            #The team was the away side in this earlier game
            print(entry['away_id'])
            print(entry['game_id'])
            print(hits)
            game_date = entry['date']
            print(game_date)
            for player in entry['away_hitting']:
                try:
                    #One player_stats call per player, every stat is read from the same result
                    hitting = player_stats(player, game_date, game_date, year)['hitting']
                    stat_list = [hitting[stat] for stat in ops_stats_list]
                    print(stat_list)
                    hits+= stat_list[0]
                    baseOnBalls+= stat_list[1]
                    hitByPitch+= stat_list[2]
                    plateAppearances+= stat_list[3]
                except Exception:
                    #Best effort: players whose stats cannot be read are left out
                    pass
    df.loc[len(df)] = [game_id, round((hits+baseOnBalls+hitByPitch)/plateAppearances,3)]
    print(hits)
    return df


## Here is the working function for 10 day obp average

###### Will have to run it for each game, for each team, to get a rolling 10 day average

In [None]:
#10 day obp average

#using this, as it is more efficient

def obp_rolling_10(game_id, type_id = 'home_id'):
    """Return the 10 day obp of one team for one game.

    The game is looked up in the global roster_groups list (last matching
    entry). Every hitter listed for the selected side is passed to
    player_stats for the window from 9 days before the game date to the game
    date, and the summed stats give obp = (H+BB+HBP)/PA.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']
    type_id : 'home_id' for the home team, 'away_id' for the away team

    Returns
    -------
    DataFrame with one row and the columns game_id, obp_10.

    Raises
    ------
    ValueError : type_id is not 'home_id'/'away_id', or game_id is not in roster_groups
    ZeroDivisionError : no plate appearances could be summed
    """
    #Roster key holding the hitters for each accepted type_id
    hitters_key = {'home_id': 'home_hitting', 'away_id': 'away_hitting'}
    if type_id not in hitters_key:
        raise ValueError("type_id must be 'home_id' or 'away_id', got {!r}".format(type_id))

    #All roster_groups entries for this game_id, the last one is used
    matches = [entry for entry in roster_groups if entry['game_id'] == game_id]
    if not matches:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))
    this_game = matches[-1]

    #NOTE(review): if player_stats treats end_date as inclusive, the game being described is part of its own window - confirm
    end_date = this_game['date']
    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]
    #Getting start_date - 9 days before the date of the game
    start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=-9)).strftime('%Y-%m-%d')

    #Creating df with obp_10 as the only stat column
    df = pd.DataFrame(columns = ['game_id','obp_10'])

    #Initializing needed stats to create obp
    hits = 0
    baseOnBalls = 0
    hitByPitch = 0
    plateAppearances = 0
    ops_stats_list = ['hits', 'baseOnBalls','hitByPitch', 'plateAppearances']
    for player in this_game[hitters_key[type_id]]:
        try:
            #One player_stats call per player, every stat is read from the same result
            hitting = player_stats(player, start_date, end_date, year)['hitting']
            stat_list = [hitting[stat] for stat in ops_stats_list]
            hits+= stat_list[0]
            baseOnBalls+= stat_list[1]
            hitByPitch+= stat_list[2]
            plateAppearances+= stat_list[3]
        except Exception:
            #Best effort: players whose stats cannot be read are left out
            continue
    df.loc[len(df)] = [game_id,round((hits+baseOnBalls+hitByPitch)/plateAppearances,3)]
    return df

In [None]:
#Testing obp_rolling_10 for the home team - takes 22-28s to run
obp_rolling_10(633899, 'home_id')

In [None]:
#Testing obp_rolling_10 for the away team of the same game
obp_rolling_10(633899, 'away_id')

## Creating slug_rolling_10

In [None]:
#10 day slugging average

def slug_rolling_10(game_id, type_id = 'home_id'):
    """Return the 10 day slugging of one team for one game.

    The game is looked up in the global roster_groups list (last matching
    entry). Every hitter listed for the selected side is passed to
    player_stats for the window from 9 days before the game date to the game
    date, and the summed stats give slug = (H+2B+2*3B+3*HR)/AB.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']
    type_id : 'home_id' for the home team, 'away_id' for the away team

    Returns
    -------
    DataFrame with one row and the columns game_id, slug_10.

    Raises
    ------
    ValueError : type_id is not 'home_id'/'away_id', or game_id is not in roster_groups
    ZeroDivisionError : no at bats could be summed
    """
    #Roster key holding the hitters for each accepted type_id
    hitters_key = {'home_id': 'home_hitting', 'away_id': 'away_hitting'}
    if type_id not in hitters_key:
        raise ValueError("type_id must be 'home_id' or 'away_id', got {!r}".format(type_id))

    #All roster_groups entries for this game_id, the last one is used
    matches = [entry for entry in roster_groups if entry['game_id'] == game_id]
    if not matches:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))
    this_game = matches[-1]

    #NOTE(review): if player_stats treats end_date as inclusive, the game being described is part of its own window - confirm
    end_date = this_game['date']
    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]

    #Creating df with slug_10 as the only stat column
    df = pd.DataFrame(columns = ['game_id','slug_10'])

    #Getting start date - 9 days before the end_date
    start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=-9)).strftime('%Y-%m-%d')

    #Initializing needed stats to create slugging
    hits = 0
    doubles = 0
    triples = 0
    homeRuns = 0
    atBats = 0
    ops_stats_list = ['hits', 'doubles','triples', 'homeRuns', 'atBats']
    for player in this_game[hitters_key[type_id]]:
        try:
            #One player_stats call per player, every stat is read from the same result
            hitting = player_stats(player, start_date, end_date, year)['hitting']
            stat_list = [hitting[stat] for stat in ops_stats_list]
            hits+= stat_list[0]
            doubles+= stat_list[1]
            triples+= stat_list[2]
            homeRuns+= stat_list[3]
            atBats+= stat_list[4]
        except Exception:
            #Best effort: players whose stats cannot be read are left out
            continue
    df.loc[len(df)] = [game_id,round((hits+doubles+triples*2+homeRuns*3)/atBats,3)]
    return df

In [None]:
#Testing slug_rolling_10 for the home team of game 633899
slug_rolling_10(633899, 'home_id')

In [None]:
#Testing slug_rolling_10 for the away team of game 633899
slug_rolling_10(633899, 'away_id')

## Creating obp_rolling_10_diff

###### This will be used over obp_rolling_10

In [None]:
#10 day obp average home team - away team

def obp_rolling_10_diff(game_id):
    """Return home team minus away team 10 day obp for one game.

    The game is looked up in the global roster_groups list (first matching
    entry). Every hitter listed for each side is passed to player_stats for
    the window from 9 days before the game date to the game date, and the
    summed stats give obp = (H+BB+HBP)/PA per team.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']

    Returns
    -------
    DataFrame with one row and the columns game_id, obp_10, where obp_10 is
    the home value minus the away value.

    Raises
    ------
    ValueError : game_id is not present in roster_groups
    ZeroDivisionError : a team ends up with 0 plate appearances
    """
    #First roster_groups entry with this game_id
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))

    #NOTE(review): if player_stats treats end_date as inclusive, the game being described is part of its own window - confirm
    end_date = game['date']
    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]
    #Getting start_date - 9 days before the date of the game
    start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=-9)).strftime('%Y-%m-%d')

    #List of hitting stats needed to calculate obp
    obp_stats_list = ['hits', 'baseOnBalls','hitByPitch', 'plateAppearances']

    def team_totals(players):
        #Adding the stats of all listed hitters together for the 10 days
        totals = dict.fromkeys(obp_stats_list, 0)
        for player in players:
            try:
                #One player_stats call per player, every stat is read from the same result
                hitting = player_stats(player, start_date, end_date, year)['hitting']
                stat_list = [hitting[stat] for stat in obp_stats_list]
                for stat, value in zip(obp_stats_list, stat_list):
                    totals[stat]+= value
            except Exception:
                #Best effort: players whose stats cannot be read are left out of the totals
                continue
        return totals

    def obp(totals):
        return round((totals['hits']+totals['baseOnBalls']+totals['hitByPitch'])/totals['plateAppearances'],3)

    #Summing the home team first, then the away team
    home_totals = team_totals(game['home_hitting'])
    away_totals = team_totals(game['away_hitting'])

    #Appending stats to their respective dfs
    df_h = pd.DataFrame(columns = ['game_id','obp_10'])
    df_h.loc[len(df_h)] = [game_id,obp(home_totals)]
    df_a = pd.DataFrame(columns = ['game_id','obp_10'])
    df_a.loc[len(df_a)] = [game_id,obp(away_totals)]

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Testing - takes about the same amount of time as running obp_rolling_10 twice, but returns the home - away difference in one call
obp_rolling_10_diff(633899)

## Creating slug_rolling_10_diff

##### This will be used over slug_rolling_10

In [None]:
#10 day slugging home team - away team

def slug_rolling_10_diff(game_id):
    """Return home team minus away team 10 day slugging for one game.

    The game is looked up in the global roster_groups list (first matching
    entry). Every hitter listed for each side is passed to player_stats for
    the window from 9 days before the game date to the game date, and the
    summed stats give slug = (H+2B+2*3B+3*HR)/AB per team.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']

    Returns
    -------
    DataFrame with one row and the columns game_id, slug_10, where slug_10 is
    the home value minus the away value.

    Raises
    ------
    ValueError : game_id is not present in roster_groups
    ZeroDivisionError : a team ends up with 0 at bats
    """
    #First roster_groups entry with this game_id
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))

    #NOTE(review): if player_stats treats end_date as inclusive, the game being described is part of its own window - confirm
    end_date = game['date']
    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]

    #Getting start date - 9 days before the end_date
    start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=-9)).strftime('%Y-%m-%d')

    #Creating list of all stats needed to calculate slugging
    slug_stats_list = ['hits', 'doubles','triples', 'homeRuns', 'atBats']

    def team_totals(players):
        #Adding the stats of all listed hitters together for the 10 days
        totals = dict.fromkeys(slug_stats_list, 0)
        for player in players:
            try:
                #One player_stats call per player, every stat is read from the same result
                hitting = player_stats(player, start_date, end_date, year)['hitting']
                stat_list = [hitting[stat] for stat in slug_stats_list]
                for stat, value in zip(slug_stats_list, stat_list):
                    totals[stat]+= value
            except Exception:
                #Best effort: players whose stats cannot be read are left out of the totals
                continue
        return totals

    def slug(totals):
        return round((totals['hits']+totals['doubles']+totals['triples']*2+totals['homeRuns']*3)/totals['atBats'],3)

    #Summing the home team first, then the away team
    home_totals = team_totals(game['home_hitting'])
    away_totals = team_totals(game['away_hitting'])

    #Appending stats to their respective dfs
    df_h = pd.DataFrame(columns = ['game_id','slug_10'])
    df_h.loc[len(df_h)] = [game_id,slug(home_totals)]
    df_a = pd.DataFrame(columns = ['game_id','slug_10'])
    df_a.loc[len(df_a)] = [game_id,slug(away_totals)]

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Testing function (home - away 10 day slugging) - takes 1 min to run
slug_rolling_10_diff(633899)

## Creating function to calculate team pitching stats up to a specific date in the season

##### e.g. the day of the game 

In [None]:
#Creating team pitching stats function - input dates needed here

#Function takes the game_id, start date, end date, year and whether you want home or away pitching stats as inputs
#returns a df with five columns of the calculated stats

def team_pitching_stats_dates(game_id, start_date, end_date, year, type='home'):
    """Return starting pitcher and bullpen era/whip for one side of a game.

    Every roster_groups entry with a matching game_id is processed: each
    bullpen pitcher of the selected side is passed to player_stats for the
    given period and summed, and the starting pitcher's stats are read the
    same way.

    Parameters
    ----------
    game_id : value compared against roster_groups[...]['game_id']
    start_date, end_date, year : passed through to player_stats
    type : 'home' or 'away' (name kept for callers, it shadows the builtin)

    Returns
    -------
    DataFrame with one row and the columns game_id, era_sp, whip_sp, era_bp,
    whip_bp.

    Raises
    ------
    ValueError : type is not 'home'/'away', or game_id is not in roster_groups
    ZeroDivisionError : the starter or the bullpen has 0 innings pitched
    Whatever player_stats raises for the starting pitcher (not caught, as before)
    """
    if type not in ('home', 'away'):
        raise ValueError("type must be 'home' or 'away', got {!r}".format(type))

    #NOTE(review): 'runs' is used rather than earned runs, so the era columns count all runs - confirm intended
    pitching_stats = ['runs', 'inningsPitched', 'baseOnBalls', 'hits']
    team_pitch_stats = ['game_id','era_sp', 'whip_sp', 'era_bp', 'whip_bp']

    def pitcher_line(player):
        #One player_stats call per pitcher, every stat is read from the same result
        pitching = player_stats(player, start_date, end_date, year)['pitching']
        stat_list = [pitching[stat] for stat in pitching_stats]

        #Changing string to float
        innings = float(stat_list[1])

        #Innings pitched ending in .1 or .2 mean 1/3 or 2/3 of an inning
        #Exact thirds are used so that summed innings do not drift (0.333 + 0.333 + 0.333 != 1)
        whole = int(innings)
        tenths = round(innings - whole,1)
        if tenths == 0.2:
            innings = whole + 2/3
        elif tenths == 0.1:
            innings = whole + 1/3
        stat_list[1] = innings
        return stat_list

    #Initializing empty df with specified column names
    df = pd.DataFrame(columns = team_pitch_stats)
    runs = 0
    inningsPitched = 0
    walks = 0
    hits = 0
    found = False
    for game in roster_groups:
        #Check for game_id
        if game['game_id'] != game_id:
            continue
        found = True

        #Getting starting pitcher
        sp = game[type + '_sp']

        #Getting stats of all players in the bullpen for the selected team this game
        for reliever in game[type + '_bullpen']:
            try:
                stat_list = pitcher_line(reliever)
                runs+= stat_list[0]
                inningsPitched+= stat_list[1]
                walks+= stat_list[2]
                hits+= stat_list[3]
            except Exception:
                #Best effort: relievers whose stats cannot be read are left out
                continue

        #Starting pitcher stats are required, so errors are not swallowed here
        sp_runs, sp_inningsPitched, sp_walks, sp_hits = pitcher_line(sp)

    if not found:
        raise ValueError("game_id {} not found in roster_groups".format(game_id))

    #appending stats to the df
    df.loc[len(df)] = [game_id,round(sp_runs*9/sp_inningsPitched,3),round((sp_hits+sp_walks)/sp_inningsPitched,3),round(runs*9/inningsPitched,3),round((hits+walks)/inningsPitched,3)]
    return df

In [None]:
#Creating team pitching stats function - No date inputs needed

#Function takes the game_id, year and whether you want home or away pitching stats as inputs
#returns a df with five columns of the calculated stats

def team_pitching_stats(game_id, year, type='home'):
    """Return season-to-date starter and bullpen ERA/WHIP for one side of a game.

    The stat window runs from the regular-season start date of ``year`` (via
    ``season_start_end``) through the date stored for the game in
    ``roster_groups``.

    Args:
        game_id: Game identifier matched against the ``'game_id'`` entry of
            each item in ``roster_groups``.
        year: Season passed to ``season_start_end`` and ``player_stats``.
        type: ``'home'`` or ``'away'``; selects the ``<type>_sp`` and
            ``<type>_bullpen`` roster entries. The name shadows the builtin
            but is kept so existing keyword callers keep working.

    Returns:
        A one-row DataFrame with columns ``game_id``, ``era_sp``, ``whip_sp``,
        ``era_bp`` and ``whip_bp``. ERA is ``runs * 9 / innings`` and WHIP is
        ``(hits + walks) / innings``, each rounded to 3 decimals.

    Raises:
        ValueError: If ``type`` is not ``'home'``/``'away'`` or ``game_id`` is
            not found in ``roster_groups``.
        ZeroDivisionError: If the starter or the bullpen has no innings
            pitched in the window.
    """
    #Validating up front so a bad side fails clearly instead of as an unbound local
    if type not in ('home', 'away'):
        raise ValueError("type must be 'home' or 'away', got {!r}".format(type))

    #First roster entry whose game_id matches
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError('game_id {!r} not found in roster_groups'.format(game_id))

    #Stat window: start of the requested season through the date of the game
    start_date = season_start_end(year)[0]
    end_date = game['date']

    def pitcher_line(player_id):
        #One player_stats call per pitcher; every stat is read from that single result
        pitching = player_stats(player_id, start_date, end_date, year)['pitching']
        runs = pitching['runs']

        #Innings pitched ending in .1 or .2 mean one or two outs, i.e. 1/3 or 2/3 of an inning
        innings = float(pitching['inningsPitched'])
        whole = int(innings)
        partial = round(innings - whole, 1)
        if partial == 0.2:
            innings = whole + 2 / 3
        elif partial == 0.1:
            innings = whole + 1 / 3

        return [runs, innings, pitching['baseOnBalls'], pitching['hits']]

    #Bullpen totals in the order runs, innings, walks, hits
    bullpen = [0, 0, 0, 0]
    for reliever in game[type + '_bullpen']:
        try:
            line = pitcher_line(reliever)
            updated = [total + value for total, value in zip(bullpen, line)]
        except Exception:
            #Best effort: a reliever whose pitching line cannot be read is left out
            continue
        #Committing only complete lines keeps the totals consistent
        bullpen = updated
    runs, inningsPitched, walks, hits = bullpen

    #Starting pitcher stats are required, so any error here propagates
    sp_runs, sp_inningsPitched, sp_walks, sp_hits = pitcher_line(game[type + '_sp'])

    #Appending the calculated rates to an empty df with the expected columns
    df = pd.DataFrame(columns = ['game_id', 'era_sp', 'whip_sp', 'era_bp', 'whip_bp'])
    df.loc[len(df)] = [
        game_id,
        round(sp_runs * 9 / sp_inningsPitched, 3),
        round((sp_hits + sp_walks) / sp_inningsPitched, 3),
        round(runs * 9 / inningsPitched, 3),
        round((hits + walks) / inningsPitched, 3),
    ]
    return df

In [None]:
#Spot check: away-side pitching stats for game 633899
team_pitching_stats(633899, 2021, type='away')

In [None]:
#Spot check: home-side pitching stats for game 633899
team_pitching_stats(633899, 2021, type='home')

In [None]:
#Creating pitching_stats_diff function

#Function takes the game_id and year as inputs
#returns a df of home team - away team stats

def pitching_stats_diff(game_id, year):
    """Return home-minus-away season-to-date pitching rates for a game.

    For each side the starter's and the bullpen's ERA (``runs * 9 / innings``)
    and WHIP (``(hits + walks) / innings``) are computed over the window from
    the regular-season start of ``year`` through the game date stored in
    ``roster_groups``, rounded to 3 decimals, and the away values are
    subtracted from the home values.

    Args:
        game_id: Game identifier matched against the ``'game_id'`` entry of
            each item in ``roster_groups``.
        year: Season passed to ``season_start_end`` and ``player_stats``.

    Returns:
        A one-row DataFrame with columns ``game_id``, ``era_sp``, ``whip_sp``,
        ``era_bp`` and ``whip_bp`` holding the home-minus-away differences.

    Raises:
        ValueError: If ``game_id`` is not found in ``roster_groups``.
        ZeroDivisionError: If a starter or a bullpen has no innings pitched
            in the window.
    """
    team_pitch_stats = ['game_id', 'era_sp', 'whip_sp', 'era_bp', 'whip_bp']

    #First roster entry whose game_id matches
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError('game_id {!r} not found in roster_groups'.format(game_id))

    #Stat window: start of the requested season through the date of the game
    start_date = season_start_end(year)[0]
    end_date = game['date']

    def pitcher_line(player_id):
        #One player_stats call per pitcher; every stat is read from that single result
        pitching = player_stats(player_id, start_date, end_date, year)['pitching']
        runs = pitching['runs']

        #Innings pitched ending in .1 or .2 mean one or two outs, i.e. 1/3 or 2/3 of an inning
        innings = float(pitching['inningsPitched'])
        whole = int(innings)
        partial = round(innings - whole, 1)
        if partial == 0.2:
            innings = whole + 2 / 3
        elif partial == 0.1:
            innings = whole + 1 / 3

        return [runs, innings, pitching['baseOnBalls'], pitching['hits']]

    def side_row(side):
        #Bullpen totals in the order runs, innings, walks, hits
        bullpen = [0, 0, 0, 0]
        for reliever in game[side + '_bullpen']:
            try:
                line = pitcher_line(reliever)
                updated = [total + value for total, value in zip(bullpen, line)]
            except Exception:
                #Best effort: a reliever whose pitching line cannot be read is left out
                continue
            #Committing only complete lines keeps the totals consistent
            bullpen = updated
        bp_runs, bp_innings, bp_walks, bp_hits = bullpen

        #Starting pitcher stats are required, so any error here propagates
        sp_runs, sp_innings, sp_walks, sp_hits = pitcher_line(game[side + '_sp'])

        return [
            game_id,
            round(sp_runs * 9 / sp_innings, 3),
            round((sp_hits + sp_walks) / sp_innings, 3),
            round(bp_runs * 9 / bp_innings, 3),
            round((bp_hits + bp_walks) / bp_innings, 3),
        ]

    #Empty dfs for each side; rows are labelled by each frame's own length
    df_h = pd.DataFrame(columns = team_pitch_stats)
    df_a = pd.DataFrame(columns = team_pitch_stats)
    df_h.loc[len(df_h)] = side_row('home')
    df_a.loc[len(df_a)] = side_row('away')

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Testing function - takes 22s
pitching_stats_diff(game_id=633899, year=2021)

## Creating a whip_rolling_10_diff function

##### This function will only include the players on the 26-man roster for this game.
##### If a player appeared in the last 10 days but was demoted or traded before this game, their stats will not be included in the 10-day averages

In [None]:
def whip_rolling_10_diff(game_id):
    """Return the home-minus-away team WHIP over the 10 days ending on game day.

    The window starts 9 days before the game date stored in ``roster_groups``
    and ends on that date. WHIP is ``(hits + walks) / innings`` summed over
    every player in the side's ``<side>_hitting`` roster list who has a
    readable pitching line in the window, rounded to 3 decimals.

    Args:
        game_id: Game identifier matched against the ``'game_id'`` entry of
            each item in ``roster_groups``.

    Returns:
        A one-row DataFrame with columns ``game_id`` and ``whip_10`` holding
        the home-minus-away difference.

    Raises:
        ValueError: If ``game_id`` is not found in ``roster_groups``.
        ZeroDivisionError: If a side has no innings pitched in the window.
    """
    #First roster entry whose game_id matches
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError('game_id {!r} not found in roster_groups'.format(game_id))

    #End date - date of the game input
    end_date = game['date']

    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]

    #Getting start date - 9 days before the end_date
    start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") - datetime.timedelta(days=9)).strftime('%Y-%m-%d')

    def pitching_line(player_id):
        #One player_stats call per player; every stat is read from that single result
        pitching = player_stats(player_id, start_date, end_date, year)['pitching']
        hits = pitching['hits']

        #Innings pitched ending in .1 or .2 mean one or two outs, i.e. 1/3 or 2/3 of an inning
        innings = float(pitching['inningsPitched'])
        whole = int(innings)
        partial = round(innings - whole, 1)
        if partial == 0.2:
            innings = whole + 2 / 3
        elif partial == 0.1:
            innings = whole + 1 / 3

        return [hits, innings, pitching['baseOnBalls']]

    def team_whip(player_ids):
        #Totals in the order hits, innings, walks
        totals = [0, 0, 0]
        for player_id in player_ids:
            try:
                line = pitching_line(player_id)
                updated = [total + value for total, value in zip(totals, line)]
            except Exception:
                #Best effort: players whose pitching line cannot be read are left out
                continue
            #Committing only complete lines keeps the totals consistent
            totals = updated
        hits, innings, walks = totals
        return round((hits + walks) / innings, 3)

    #Creating dfs for home and away team whip
    df_h = pd.DataFrame(columns = ['game_id','whip_10'])
    df_a = pd.DataFrame(columns = ['game_id','whip_10'])

    #Using the hitting lists because, per the original note, they include the starting pitchers,
    #the bullpen, and any possible position player relief
    df_h.loc[len(df_h)] = [game_id, team_whip(game['home_hitting'])]
    df_a.loc[len(df_a)] = [game_id, team_whip(game['away_hitting'])]

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Testing function - takes 14s
whip_rolling_10_diff(game_id=633899)

## Combining hitting rolling average functions (obp and slug 10)

In [None]:
#Combining obp and slug rolling 10 day averages to one function for efficiency

def obp_slug_roll10_diff(game_id):
    """Return home-minus-away team OBP and slugging over the 10 days ending on game day.

    The window starts 9 days before the game date stored in ``roster_groups``
    and ends on that date. For each side the stats of every player in
    ``<side>_hitting`` with a readable hitting line are summed, then

    * ``obp_10``  = ``(hits + walks + hitByPitch) / plateAppearances``
    * ``slug_10`` = ``(hits + doubles + 2 * triples + 3 * homeRuns) / atBats``

    are rounded to 3 decimals and the away values subtracted from the home ones.

    Args:
        game_id: Game identifier matched against the ``'game_id'`` entry of
            each item in ``roster_groups``.

    Returns:
        A one-row DataFrame with columns ``game_id``, ``obp_10`` and
        ``slug_10`` holding the home-minus-away differences.

    Raises:
        ValueError: If ``game_id`` is not found in ``roster_groups``.
        ZeroDivisionError: If a side has no plate appearances or at bats in
            the window.
    """
    #First roster entry whose game_id matches
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError('game_id {!r} not found in roster_groups'.format(game_id))

    #End date - date of the game input
    end_date = game['date']

    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]

    #Getting start date - 9 days before the end_date
    start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") - datetime.timedelta(days=9)).strftime('%Y-%m-%d')

    #All stats needed to calculate obp and slugging
    obp_slug_stats_list = ['hits', 'doubles','triples', 'homeRuns', 'atBats', 'baseOnBalls', 'hitByPitch', 'plateAppearances']

    def team_rates(player_ids):
        totals = dict.fromkeys(obp_slug_stats_list, 0)
        for player_id in player_ids:
            try:
                #One player_stats call per player; every stat is read from that single result
                hitting = player_stats(player_id, start_date, end_date, year)['hitting']
                updated = {stat: totals[stat] + hitting[stat] for stat in obp_slug_stats_list}
            except Exception:
                #Best effort: players whose hitting line cannot be read are left out
                continue
            #Committing only complete lines keeps the totals consistent
            totals = updated

        obp = round((totals['hits'] + totals['baseOnBalls'] + totals['hitByPitch']) / totals['plateAppearances'], 3)
        slug = round((totals['hits'] + totals['doubles'] + totals['triples'] * 2 + totals['homeRuns'] * 3) / totals['atBats'], 3)
        return [obp, slug]

    #Creating dfs for home and away team obp and slugging
    df_h = pd.DataFrame(columns = ['game_id','obp_10','slug_10'])
    df_a = pd.DataFrame(columns = ['game_id', 'obp_10','slug_10'])

    #Appending stats to their respective dfs
    df_h.loc[len(df_h)] = [game_id] + team_rates(game['home_hitting'])
    df_a.loc[len(df_a)] = [game_id] + team_rates(game['away_hitting'])

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Testing function - about 30s faster than running each individually (1 min vs 1.5 min)
obp_slug_roll10_diff(game_id=633899)

## Function for all hitting stats 

In [None]:
#Seems like it will be inefficient, as they pull stats from different time periods, and just about everything needs to be done twice

def all_hit_stats(game_id):
    """Return home-minus-away hitting rates (season-to-date and rolling 10 days) for a game.

    Two windows ending on the game date stored in ``roster_groups`` are used:
    season-to-date (from the regular-season start of the game's year) and the
    rolling window starting 9 days before the game. For each side the stats of
    every player in ``<side>_hitting`` with a readable hitting line are summed,
    then

    * ``avg``  = ``hits / atBats``
    * ``obp``  = ``(hits + walks + hitByPitch) / plateAppearances``
    * ``slug`` = ``(hits + doubles + 2 * triples + 3 * homeRuns) / atBats``

    are rounded to 3 decimals (``obp_10``/``slug_10`` use the rolling window).

    Args:
        game_id: Game identifier matched against the ``'game_id'`` entry of
            each item in ``roster_groups``.

    Returns:
        A one-row DataFrame with columns ``game_id``, ``avg``, ``obp``,
        ``slug``, ``obp_10`` and ``slug_10`` holding home-minus-away values.

    Raises:
        ValueError: If ``game_id`` is not found in ``roster_groups``.
        ZeroDivisionError: If a side has no at bats or plate appearances in
            one of the windows.
    """
    #First roster entry whose game_id matches
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError('game_id {!r} not found in roster_groups'.format(game_id))

    #End date - date of the game input
    end_date = game['date']

    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]

    #Getting start date for rolling 10 day stats - 9 days before the end_date
    rolling_start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") - datetime.timedelta(days=9)).strftime('%Y-%m-%d')

    #Start date of the season - for current stats
    start_date = season_start_end(year)[0]

    #List of stats being pulled from API
    hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']

    #List of stats being calculated
    team_hit_stats = ['game_id','avg', 'obp', 'slug', 'obp_10', 'slug_10']

    def team_totals(player_ids, window_start):
        totals = dict.fromkeys(hitting_stats, 0)
        for player_id in player_ids:
            try:
                #One player_stats call per player and window; every stat is read from that result
                hitting = player_stats(player_id, window_start, end_date, year)['hitting']
                updated = {stat: totals[stat] + hitting[stat] for stat in hitting_stats}
            except Exception:
                #Best effort: players whose hitting line cannot be read are left out
                continue
            #Committing only complete lines keeps the totals consistent
            totals = updated
        return totals

    def obp(totals):
        return round((totals['hits'] + totals['baseOnBalls'] + totals['hitByPitch']) / totals['plateAppearances'], 3)

    def slug(totals):
        return round((totals['hits'] + totals['doubles'] + totals['triples'] * 2 + totals['homeRuns'] * 3) / totals['atBats'], 3)

    def team_row(player_ids):
        season = team_totals(player_ids, start_date)
        rolling = team_totals(player_ids, rolling_start_date)
        return [game_id, round(season['hits'] / season['atBats'], 3), obp(season), slug(season), obp(rolling), slug(rolling)]

    #Creating empty dfs for each side with specified column names
    df_h = pd.DataFrame(columns = team_hit_stats)
    df_a = pd.DataFrame(columns = team_hit_stats)

    #Appending stats to their respective dfs
    df_h.loc[len(df_h)] = team_row(game['home_hitting'])
    df_a.loc[len(df_a)] = team_row(game['away_hitting'])

    #Returning the difference of home and away team stats
    return df_h.set_index('game_id').subtract(df_a.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Testing - took almost 3 mins
all_hit_stats(game_id=634224)

In [None]:
#Function to merge all hitting stats to a df
#Takes game_id and year as an input - could remove year from hitting_stats_diff, but it runs a bit slower this way
#Returns a df of home team- away team hitting stats for the game_id

def hit_stats_merge(game_id, year):
    """Outer-merge the three hitting-stat difference frames for a game on ``game_id``.

    Calls ``hitting_stats_diff``, ``obp_rolling_10_diff`` and
    ``slug_rolling_10_diff`` (in that order) and joins their results
    left to right.

    Args:
        game_id: Game identifier forwarded to all three functions.
        year: Season forwarded to ``hitting_stats_diff`` only.

    Returns:
        The DataFrame produced by merging the three results on ``game_id``
        with ``how='outer'``.
    """
    #Collecting the individual frames in merge order
    frames = [
        hitting_stats_diff(game_id,year),
        obp_rolling_10_diff(game_id),
        slug_rolling_10_diff(game_id),
    ]

    #Folding the frames together one at a time, starting from the first
    merged = frames[0]
    for frame in frames[1:]:
        merged = pd.merge(merged, frame, on=['game_id'], how='outer')
    return merged

## Testing the timing of hit_stats_merge and all_hit_stats

In [None]:
#Timing hit_stats_merge on the first 5 games in roster_groups
game_list = [roster_groups[idx]['game_id'] for idx in range(5)]

for i in game_list:
    print(hit_stats_merge(game_id=i, year=2021))


In [None]:
#Timing all_hit_stats on the first 5 games in roster_groups
game_list = [roster_groups[idx]['game_id'] for idx in range(5)]

for i in game_list:
    print(all_hit_stats(game_id=i))

In [None]:
## Testing for 10 games

## Adjusting functions to take two dfs as input

In [None]:
#Empty home and away dfs that share one column layout

#Column names used by both dfs
team_hit_stats = 'game_id avg obp slug obp_10 slug_10'.split()

#One independent empty df per side
df_home, df_away = (pd.DataFrame(columns = team_hit_stats) for _ in range(2))

In [None]:
#Function that combines all hitting stats and appends them to the home and away dfs

def all_hit_stats_df(game_id, df_home, df_away):
    """Append a game's home and away hitting rates to the given frames and return their difference.

    Two windows ending on the game date stored in ``roster_groups`` are used:
    season-to-date (from the regular-season start of the game's year) and the
    rolling window starting 9 days before the game. For each side the stats of
    every player in ``<side>_hitting`` with a readable hitting line are summed,
    then

    * ``avg``  = ``hits / atBats``
    * ``obp``  = ``(hits + walks + hitByPitch) / plateAppearances``
    * ``slug`` = ``(hits + doubles + 2 * triples + 3 * homeRuns) / atBats``

    are rounded to 3 decimals (``obp_10``/``slug_10`` use the rolling window).

    Args:
        game_id: Game identifier matched against the ``'game_id'`` entry of
            each item in ``roster_groups``.
        df_home: DataFrame that receives the home row; modified in place.
        df_away: DataFrame that receives the away row; modified in place.

    Returns:
        ``df_home`` minus ``df_away`` aligned on ``game_id`` (all rows
        accumulated so far, not only this game), with ``game_id`` restored
        as a column.

    Raises:
        ValueError: If ``game_id`` is not found in ``roster_groups``.
        ZeroDivisionError: If a side has no at bats or plate appearances in
            one of the windows. Neither frame is modified in that case.
    """
    #First roster entry whose game_id matches
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError('game_id {!r} not found in roster_groups'.format(game_id))

    #End date - date of the game input
    end_date = game['date']

    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]

    #Getting start date for rolling 10 day stats - 9 days before the end_date
    rolling_start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") - datetime.timedelta(days=9)).strftime('%Y-%m-%d')

    #Start date of the season - for current stats
    start_date = season_start_end(year)[0]

    #List of stats being pulled from API
    hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']

    def team_totals(player_ids, window_start):
        totals = dict.fromkeys(hitting_stats, 0)
        for player_id in player_ids:
            try:
                #One player_stats call per player and window; every stat is read from that result
                hitting = player_stats(player_id, window_start, end_date, year)['hitting']
                updated = {stat: totals[stat] + hitting[stat] for stat in hitting_stats}
            except Exception:
                #Best effort: players whose hitting line cannot be read are left out
                continue
            #Committing only complete lines keeps the totals consistent
            totals = updated
        return totals

    def obp(totals):
        return round((totals['hits'] + totals['baseOnBalls'] + totals['hitByPitch']) / totals['plateAppearances'], 3)

    def slug(totals):
        return round((totals['hits'] + totals['doubles'] + totals['triples'] * 2 + totals['homeRuns'] * 3) / totals['atBats'], 3)

    def team_row(player_ids):
        season = team_totals(player_ids, start_date)
        rolling = team_totals(player_ids, rolling_start_date)
        return [game_id, round(season['hits'] / season['atBats'], 3), obp(season), slug(season), obp(rolling), slug(rolling)]

    #Computing both rows before touching either df, so a failure cannot leave them out of step
    home_row = team_row(game['home_hitting'])
    away_row = team_row(game['away_hitting'])

    #Appending stats to their respective dfs
    df_home.loc[len(df_home)] = home_row
    df_away.loc[len(df_away)] = away_row

    #Returning the difference of home and away team stats
    return df_home.set_index('game_id').subtract(df_away.set_index('game_id'), fill_value =0).reset_index()

In [None]:
#Testing with first 3 games - took 5.5 - 6.5 mins
game_list = [roster_groups[idx]['game_id'] for idx in range(3)]

#Each call appends to df_home/df_away, so the last result covers every game processed so far
for i in game_list:
    new_df = all_hit_stats_df(game_id=i, df_home=df_home, df_away=df_away)

new_df


## Testing returning a list and appending that to a df

In [None]:
#Function that combines all hitting stats and appends it to a df

def all_hit_stats_list(game_id):

    #Checking through roster_groups list for the game_id
    for i in range(len(roster_groups)):
        if roster_groups[i]['game_id'] == game_id:

             #End date - date of the game input
            end_date = roster_groups[i]['date']

            #All players for the game
            game = roster_groups[i]

            #year is used in the player_stats function, and is the first 4 characters of the date string
            year = roster_groups[i]['date'][0:4]

            #breaking loop at this game_id then calculating stats
            break
        
    #Getting start date for rolling 10 day stats - 9 days before the end_date
    rolling_start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=-9)).strftime('%Y-%m-%d')

    #Start date for 2021 season - for current stats
    start_date = season_start_end(year)[0]

    #List of stats being pulled from API
    hitting_stats = ['hits', 'baseOnBalls', 'atBats', 'hitByPitch','doubles', 'triples', 'homeRuns', 'plateAppearances']

    #Initializing home stats variables for static hitting stats
    h_hits = 0
    h_baseOnBalls = 0
    h_atBats = 0
    h_hitByPitch = 0
    h_doubles = 0
    h_triples = 0
    h_homeRuns= 0
    h_plateAppearances = 0

    #Initializing home stats variables for rolling hitting stats
    h_r_hits = 0
    h_r_baseOnBalls = 0
    h_r_atBats = 0
    h_r_hitByPitch = 0
    h_r_doubles = 0
    h_r_triples = 0
    h_r_homeRuns= 0
    h_r_plateAppearances = 0

    #Initializing away stats variables for static hitting stats
    a_hits = 0
    a_baseOnBalls = 0
    a_atBats = 0
    a_hitByPitch = 0
    a_doubles = 0
    a_triples = 0
    a_homeRuns= 0
    a_plateAppearances = 0

    #Initializing away stats variables for rolling hitting stats
    a_r_hits = 0
    a_r_baseOnBalls = 0
    a_r_atBats = 0
    a_r_hitByPitch = 0
    a_r_doubles = 0
    a_r_triples = 0
    a_r_homeRuns= 0
    a_r_plateAppearances = 0

    #Getting stats for each player on the home team
    for j in game['home_hitting']:
        try:
            #Storing all static stats as a list
            stat_list = [player_stats(j, start_date, end_date, year)['hitting'][i] for i in hitting_stats]

            #Adding the home team static stats together for the time period
            h_hits+= stat_list[0]
            h_baseOnBalls+= stat_list[1]
            h_atBats+= stat_list[2]
            h_hitByPitch+= stat_list[3]
            h_doubles+= stat_list[4]
            h_triples+= stat_list[5]
            h_homeRuns+= stat_list[6]
            h_plateAppearances+= stat_list[7]
        except: pass

        try:
            #Storing all rolling stats as a list
            stat_list_10 = [player_stats(j, rolling_start_date, end_date, year)['hitting'][i] for i in hitting_stats]
            
            #Adding the home team rolling stats together for the time period
            h_r_hits+= stat_list_10[0]
            h_r_baseOnBalls+= stat_list_10[1]
            h_r_atBats+= stat_list_10[2]
            h_r_hitByPitch+= stat_list_10[3]
            h_r_doubles+= stat_list_10[4]
            h_r_triples+= stat_list_10[5]
            h_r_homeRuns+= stat_list_10[6]
            h_r_plateAppearances+= stat_list_10[7]
        except: pass

    #Getting stats for each player on the away team
    for j in game['away_hitting']:
        try:
            #Storing all static stats as a list
            stat_list = [player_stats(j, start_date, end_date, year)['hitting'][i] for i in hitting_stats]

            #Adding the away team stats together for the time period
            a_hits+= stat_list[0]
            a_baseOnBalls+= stat_list[1]
            a_atBats+= stat_list[2]
            a_hitByPitch+= stat_list[3]
            a_doubles+= stat_list[4]
            a_triples+= stat_list[5]
            a_homeRuns+= stat_list[6]
            a_plateAppearances+= stat_list[7]
        except: pass
        
        try:
            #Storing all rolling stats as a list
            stat_list_10 = [player_stats(j, rolling_start_date, end_date, year)['hitting'][i] for i in hitting_stats]

            #Adding the away team rolling stats together for the time period
            a_r_hits+= stat_list_10[0]
            a_r_baseOnBalls+= stat_list_10[1]
            a_r_atBats+= stat_list_10[2]
            a_r_hitByPitch+= stat_list_10[3]
            a_r_doubles+= stat_list_10[4]
            a_r_triples+= stat_list_10[5]
            a_r_homeRuns+= stat_list_10[6]
            a_r_plateAppearances+= stat_list_10[7]
        except: pass
    #appending stats to their respective dfs
    diff_list = list(np.round(np.array([round(h_hits/h_atBats,3),round((h_hits+h_baseOnBalls+h_hitByPitch)/h_plateAppearances,3),round((h_hits+h_doubles+h_triples*2+h_homeRuns*3)/h_atBats,3),round((h_r_hits+h_r_baseOnBalls+h_r_hitByPitch)/h_r_plateAppearances,3),round((h_r_hits+h_r_doubles+h_r_triples*2+h_r_homeRuns*3)/h_r_atBats,3)]) - np.array([round(a_hits/a_atBats,3),round((a_hits+a_baseOnBalls+a_hitByPitch)/a_plateAppearances,3),round((a_hits+a_doubles+a_triples*2+a_homeRuns*3)/a_atBats,3),round((a_r_hits+a_r_baseOnBalls+a_r_hitByPitch)/a_r_plateAppearances,3),round((a_r_hits+a_r_doubles+a_r_triples*2+a_r_homeRuns*3)/a_r_atBats,3)]),3))
    diff_list.insert(0, game_id)
    #Returning the difference of home and away team stats
    return diff_list

##### Testing list to df 

In [None]:
#Empty df whose columns are taken from team_hit_stats (defined outside this cell)
df_tester = pd.DataFrame(columns = team_hit_stats)
df_tester

In [None]:
#Took 6.5mins
#Appending the list returned for each game as a new row at the end of df_tester
for i in game_list:
    df_tester.loc[len(df_tester)] = all_hit_stats_list(i)
df_tester

##### Comparing list function and df function times

In [None]:
#Testing with the games at indexes 4, 5 and 6 of roster_groups (range(4,7) - the 5th to 7th entries)
game_list = [roster_groups[i]['game_id'] for i in range(4,7)]

In [None]:
#Took 6.5 mins
#Timing the df version: each call appends to df_home/df_away and returns the difference of
#every row currently in them, so new_df also includes rows added by earlier runs
for i in game_list:
    new_df = all_hit_stats_df(i, df_home, df_away)
new_df

In [None]:
#Took 6.5mins
#Timing the list version on the same game_list: one new row per game is added to df_tester
for i in game_list:
    df_tester.loc[len(df_tester)] = all_hit_stats_list(i)
df_tester

## Fixing pitching stats functions to take two dfs as an input

##### Also removing year dependence

In [None]:
#Creating function to calculate all pitching stats

#Function takes the game_id and home/away team dfs as inputs
#returns a df of home team - away team stats

def _pitch_innings_to_decimal(innings):
    """Convert an innings-pitched value (e.g. '5.2') to decimal innings.

    A fractional part of .1 or .2 stands for 1/3 or 2/3 of an inning and is
    replaced by 0.333 or 0.666; any other value is returned as float(innings).
    """
    innings = float(innings)
    whole = int(innings)
    fraction = round(innings - whole, 1)
    if fraction == 0.2:
        return whole + 0.666
    if fraction == 0.1:
        return whole + 0.333
    return innings


def _pitch_stat_values(player, start_date, end_date, year, stat_names):
    """Fetch one player's pitching stats for the period as a dict keyed by stat name.

    'inningsPitched' is converted to decimal innings; other stats are returned unchanged.
    Any error from the lookup propagates to the caller.
    """
    #One player_stats call per player and period - every stat is read from the same response
    pitching = player_stats(player, start_date, end_date, year)['pitching']
    values = {name: pitching[name] for name in stat_names}
    values['inningsPitched'] = _pitch_innings_to_decimal(values['inningsPitched'])
    return values


def _pitch_stat_totals(players, start_date, end_date, year, stat_names):
    """Sum each stat in ``stat_names`` over ``players``, skipping players whose lookup fails."""
    totals = dict.fromkeys(stat_names, 0)
    for player in players:
        try:
            values = _pitch_stat_values(player, start_date, end_date, year, stat_names)
            for name in stat_names:
                totals[name] += values[name]
        #Exception instead of a bare except so KeyboardInterrupt can still stop a long run
        except Exception:
            continue
    return totals


def _pitch_stat_row(game_id, starter, bullpen, rolling):
    """Build the [game_id, era_sp, whip_sp, era_bp, whip_bp, whip_10] row for one team."""
    return [
        game_id,
        #NOTE(review): the "era" columns are computed from 'runs', not earned runs - confirm this is intended
        round(starter['runs'] * 9 / starter['inningsPitched'], 3),
        round((starter['hits'] + starter['baseOnBalls']) / starter['inningsPitched'], 3),
        round(bullpen['runs'] * 9 / bullpen['inningsPitched'], 3),
        round((bullpen['hits'] + bullpen['baseOnBalls']) / bullpen['inningsPitched'], 3),
        round((rolling['hits'] + rolling['baseOnBalls']) / rolling['inningsPitched'], 3),
    ]


def all_pitch_stats_diff(game_id, df_home, df_away):
    """Append one game's pitching stats to df_home/df_away and return home - away.

    The game is looked up in the module-level ``roster_groups`` list. For each
    team the starting pitcher and bullpen stats cover the season start up to the
    game date; the rolling whip covers the 10 days ending on the game date.

    Parameters:
        game_id: value matched against the 'game_id' key of each roster_groups entry.
        df_home: df with columns game_id, era_sp, whip_sp, era_bp, whip_bp, whip_10;
            a row for the home team is appended in place.
        df_away: df with the same columns; a row for the away team is appended in place.

    Returns:
        DataFrame: df_home minus df_away aligned on game_id (fill_value 0), covering
        every row currently in the two dfs, with game_id restored as a column.

    Raises:
        ValueError: if game_id is not present in roster_groups.
        ZeroDivisionError: if an innings-pitched total used as a denominator is zero.
        Errors from the starting pitcher lookups are not suppressed.
    """
    #List of pitching stats used in calculations
    pitching_stats = ['runs', 'inningsPitched', 'baseOnBalls', 'hits']

    #List of stats for rolling whip average
    whip_stats_list = ['hits', 'inningsPitched', 'baseOnBalls']

    #Checking for the inputted game id in the list of dicts of games
    game = next((entry for entry in roster_groups if entry['game_id'] == game_id), None)
    if game is None:
        raise ValueError('game_id {} not found in roster_groups'.format(game_id))

    #Date of the game
    end_date = game['date']

    #year is used in the player_stats function, and is the first 4 characters of the date string
    year = end_date[0:4]

    #Getting start date of the season
    start_date = season_start_end(year)[0]

    #Start date for the 10 day rolling whip
    rolling_start_date = (datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=-9)).strftime('%Y-%m-%d')

    team_rows = []
    for side in ('home', 'away'):
        #Season-to-date totals for the bullpen
        bullpen = _pitch_stat_totals(game[side + '_bullpen'], start_date, end_date, year, pitching_stats)

        #NOTE(review): the rolling whip is summed over the '<side>_hitting' list (as in the original),
        #so only players in that list that have pitching stats contribute - confirm this is the intended group
        rolling = _pitch_stat_totals(game[side + '_hitting'], rolling_start_date, end_date, year, whip_stats_list)

        #Starting pitcher is fetched exactly once per team, outside of any player loop
        starter = _pitch_stat_values(game[side + '_sp'], start_date, end_date, year, pitching_stats)

        team_rows.append(_pitch_stat_row(game_id, starter, bullpen, rolling))

    #appending stats to the df
    df_home.loc[len(df_home)] = team_rows[0]
    df_away.loc[len(df_away)] = team_rows[1]
    return df_home.set_index('game_id').subtract(df_away.set_index('game_id'), fill_value =0).reset_index()

##### Testing function timings

In [None]:
#Empty dfs for testing - passed to all_pitch_stats_diff as the home and away dfs
df_pitching_test_1 = pd.DataFrame(columns = ['game_id','era_sp', 'whip_sp', 'era_bp', 'whip_bp', 'whip_10'])
df_pitching_test_2 = pd.DataFrame(columns = ['game_id','era_sp', 'whip_sp', 'era_bp', 'whip_bp', 'whip_10'])

In [None]:
#testing function takes around 45-50s
#Appends a row for game 633899 to both test dfs and displays the home - away difference
all_pitch_stats_diff(633899, df_pitching_test_1, df_pitching_test_2)

In [None]:
#Running whip_rolling_10_diff (defined outside this cell) on the same game_id used for all_pitch_stats_diff
whip_rolling_10_diff(633899)

In [None]:
#Running pitching_stats_diff (defined outside this cell) on the same game_id, with the year passed explicitly
pitching_stats_diff(633899,2021)

## Merging pitching functions

In [None]:
#Function to merge all pitching stats into one df
#Takes game_id as an input - the year passed to pitching_stats_diff is fixed to 2021
#Returns the outputs of pitching_stats_diff and whip_rolling_10_diff joined on game_id

def pitch_stats_merge(game_id):
    """Outer-join the two pitching stat dfs for ``game_id`` on their 'game_id' column."""
    #Static pitching stats first, then the rolling whip
    static_df = pitching_stats_diff(game_id,2021)
    rolling_df = whip_rolling_10_diff(game_id)

    #Only two dfs are involved, so a single merge gives the combined df
    return pd.merge(static_df, rolling_df, on=['game_id'], how='outer')

In [None]:
#Takes around 1min to run
#Displaying the merged pitching stats df for game 634224
pitch_stats_merge(634224)

## Merging All stat functions