# Creating a data set for model training from 2021 season data

In [3]:
#Imports
import statsapi
import datetime 
import pandas as pd
import json

In [4]:
#Getting start date and end date of a season
def season_start_end(year):
    season_info = statsapi.get("seasons", {"sportId": 1, "season": year})
    season_start = season_info['seasons'][0]['regularSeasonStartDate']
    season_end = season_info['seasons'][0]['regularSeasonEndDate']
    return [season_start, season_end]

In [5]:
start_end_2021 = season_start_end(2021)
season_start = start_end_2021[0]
season_end = start_end_2021[1]

In [6]:
#Adding 35 days to the season start, when I want to start collecting data
season_start_offset = (datetime.datetime.strptime(season_start, "%Y-%m-%d") + datetime.timedelta(days=35)).strftime('%Y-%m-%d')

In [7]:
#Geting all season matchup outcomes
def season_match_outcomes(season_start, season_end):
    dict_list = []
    for i in statsapi.schedule(start_date=season_start, end_date=season_end):
        match_dict = {}
        try:
            match_dict['game_id'] = i['game_id']
            match_dict['win'] = i['winning_team']
            match_dict['loss'] = i['losing_team']
            dict_list.append(match_dict)
        except: pass
    return dict_list

In [8]:
#Setting variable to the output for season match outcomes
all_matches = season_match_outcomes(season_start_offset, season_end)

In [9]:
#Creating a list of all game ids for period in 2021 season
game_ids = [i['game_id'] for i in all_matches]

In [10]:
game_info = statsapi.boxscore_data(632520, timecode=None)
game_info

{'gameId': '2021/09/13/sdnmlb-sfnmlb-1',
 'teamInfo': {'away': {'id': 135,
   'abbreviation': 'SD',
   'teamName': 'Padres',
   'shortName': 'San Diego'},
  'home': {'id': 137,
   'abbreviation': 'SF',
   'teamName': 'Giants',
   'shortName': 'San Francisco'}},
 'playerInfo': {'ID624428': {'id': 624428,
   'fullName': 'Adam Frazier',
   'boxscoreName': 'Frazier'},
  'ID543339': {'id': 543339,
   'fullName': 'Daniel Hudson',
   'boxscoreName': 'Hudson, D'},
  'ID595777': {'id': 595777,
   'fullName': 'Jurickson Profar',
   'boxscoreName': 'Profar'},
  'ID642731': {'id': 642731,
   'fullName': 'Thairo Estrada',
   'boxscoreName': 'Estrada'},
  'ID605397': {'id': 605397,
   'fullName': 'Joe Musgrove',
   'boxscoreName': 'Musgrove'},
  'ID657277': {'id': 657277, 'fullName': 'Logan Webb', 'boxscoreName': 'Webb'},
  'ID489334': {'id': 489334,
   'fullName': 'Craig Stammen',
   'boxscoreName': 'Stammen'},
  'ID500779': {'id': 500779,
   'fullName': 'Jose Quintana',
   'boxscoreName': 'Quintan

## Creating json of all the 2021 matchups and rosters

In [11]:
#Takes a long time to run (16min) - There was a better way I am sure, a df would have been better, but using this for now
#Commenting out to not run again
#dict_list = []
#for i in game_ids:
#    game_info = statsapi.boxscore_data(i, timecode=None)
#    roster_dict = {}
#    roster_dict['game_id'] = int(i)
#    roster_dict['home_roster'] = game_info['home']['battingOrder'] + game_info['home']['bullpen']
#    roster_dict['away_roster'] = game_info['away']['battingOrder'] + game_info['away']['bullpen']
#    dict_list.append(roster_dict)
#dict_list

In [12]:
#Creating json from above dict_list
#Commenting out to not run again
#with open('roster_dict', 'w') as fout:
#    json.dump(dict_list, fout)

##### Adding date to data in a new json file
###### Remembered a bit later I needed the date to use my stats function 

In [13]:
#Takes a long time to run (16min) - There was a better way I am sure, a df would have been better, but using this for now
#Commenting out to not run again
#dict_list = []
#for i in game_ids:
#    game_info = statsapi.boxscore_data(i, timecode=None)
#    roster_dict = {}
#    roster_dict['game_id'] = int(i)
#    roster_dict['date'] = game_info['gameId'][0:10].replace('/', '-')
#    roster_dict['home_roster'] = game_info['home']['battingOrder'] + game_info['home']['bullpen']
#    roster_dict['away_roster'] = game_info['away']['battingOrder'] + game_info['away']['bullpen']
#    dict_list.append(roster_dict)
#dict_list

In [14]:
#Creating json from above dict_list
#Commenting out to not run again
#with open('roster_date_dict', 'w') as fout:
#    json.dump(dict_list, fout)

## Creating Player Stats function

In [15]:
#Creation of this function further down -  need to use it in the stats function

#Function to sum values in two dictionaries
def add_dicts(d1, d2):
    d3 = dict(d1)
    d3.update(d2)
    try:
        for i, j in d1.items():

            for x, y in d2.items():

                if (i == x):

                    d3[i]=(j+y)
    except: pass
    return d3

##### Keeping for reference for now
def player_stats(id,start_date, end_date, season):
    player_info = statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[hitting,fielding,pitching],type=[byDateRange],\
    startDate={start_date},endDate={end_date},season={season})"})
    #dict where I will add all the collected stats
    stat_dict = {}
    #Temporary storage for fielding dicts for all positions
    fielding_list = []
    #List to check if the stats for a fielding position have already been collected - as they are listed more than once
    pos_codes = []
    try:
        player_hitting_stats = player_info['people'][0]['stats'][0]['splits'][0]['stat']
        stat_dict['hitting'] = player_hitting_stats
    except: pass
    try: 
        for i in player_info['people'][0]['stats'][1]['splits']:
            if player_info['people'][0]['stats'][1]['splits'][i]['stat']['position']['code'].isin(pos_codes):
                pass
            else:
                pos_codes.append(player_info['people'][0]['stats'][1]['splits'][i]['stat']['position']['code'])
                fielding_list.append(player_info['people'][0]['stats'][i]['splits'][0]['stat']) 
        all_fielding = {}
        for i in fielding_list:
            all_fielding.update(i)

            stat_dict['fielding'] = 1 #Placeholder
    except: pass
    try:
        player_pitching_stats = player_info['people'][0]['stats'][2]['splits'][0]['stat']
        stat_dict['pitching'] = player_pitching_stats
    except:pass
    return stat_dict

In [16]:
#Create function to get player stats
#Inputs are the id, start date and end date

#Currently not getting all the fielding stats, as there are more than one set of them - based on the pos they play - fix in progress


def player_stats(id,start_date, end_date, season):

    #Getting player stats - separately because the API switches their order when fetched at the same time 
    hitting_stats = statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[hitting],type=[byDateRange],\
    startDate={start_date},endDate={end_date},season={season})"})
    pitching_stats = statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[pitching],type=[byDateRange],\
    startDate={start_date},endDate={end_date},season={season})"})
    fielding_stats = statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[fielding],type=[byDateRange],\
    startDate={start_date},endDate={end_date},season={season})"})

    #dict where I will add all the collected stats
    stat_dict = {}

    #Temporary storage for fielding dicts for all positions
    fielding_list = []

    #List to check if the stats for a fielding position have already been collected - as they are listed more than once
    pos_codes = []

    try:
        player_hitting_stats = hitting_stats['people'][0]['stats'][0]['splits'][0]['stat']
        stat_dict['hitting'] = player_hitting_stats
    except: pass
    
    try:
        player_pitching_stats = pitching_stats['people'][0]['stats'][0]['splits'][-1]['stat']
        stat_dict['pitching'] = player_pitching_stats
    except: pass

    try:
        for i in range(len(fielding_stats['people'][0]['stats'][0]['splits'])):
            if fielding_stats['people'][0]['stats'][0]['splits'][i]['stat']['position']['code'] in pos_codes:
                pass
            else:
                pos_codes.append(fielding_stats['people'][0]['stats'][0]['splits'][i]['stat']['position']['code'])
                fielding_list.append(fielding_stats['people'][0]['stats'][0]['splits'][i]['stat']) 
                
        all_fielding = {}

        for i in range(len(fielding_list)):
            all_fielding = add_dicts(all_fielding, fielding_list[i])
        stat_dict['fielding'] = all_fielding
        
        
    except:pass
    return stat_dict

In [17]:
#Testing function for 2022
player_stats(518934,'2022-04-05','2022-08-12', 2022)

{'hitting': {'gamesPlayed': 103,
  'groundOuts': 139,
  'airOuts': 83,
  'runs': 71,
  'doubles': 18,
  'triples': 0,
  'homeRuns': 12,
  'strikeOuts': 56,
  'baseOnBalls': 61,
  'intentionalWalks': 0,
  'hits': 109,
  'hitByPitch': 4,
  'avg': '.282',
  'atBats': 386,
  'obp': '.385',
  'slg': '.422',
  'ops': '.807',
  'caughtStealing': 3,
  'stolenBases': 4,
  'stolenBasePercentage': '.571',
  'groundIntoDoublePlay': 8,
  'groundIntoTriplePlay': 0,
  'numberOfPitches': 1834,
  'plateAppearances': 452,
  'totalBases': 163,
  'rbi': 43,
  'leftOnBase': 127,
  'sacBunts': 0,
  'sacFlies': 1,
  'babip': '.304',
  'groundOutsToAirouts': '1.67',
  'catchersInterference': 0,
  'atBatsPerHomeRun': '32.17'},
 'fielding': {'gamesPlayed': 109,
  'gamesStarted': 95,
  'assists': 191,
  'putOuts': 234,
  'errors': 2,
  'chances': 427,
  'fielding': '1.000.995.000.992',
  'position': {'code': '5',
   'name': 'Third Base',
   'type': 'Infielder',
   'abbreviation': '3B'},
  'rangeFactorPerGame': '

In [18]:
#Testing player stats for a pitcher - Found I needed to use [-1] in my function to get the cumulative pitching stats
player_stats(593974,'2021-04-01', '2021-05-06', 2021)

{'hitting': {'gamesPlayed': 9,
  'groundOuts': 0,
  'airOuts': 0,
  'runs': 0,
  'doubles': 0,
  'triples': 0,
  'homeRuns': 0,
  'strikeOuts': 0,
  'baseOnBalls': 0,
  'intentionalWalks': 0,
  'hits': 0,
  'hitByPitch': 0,
  'avg': '.000',
  'atBats': 0,
  'obp': '.000',
  'slg': '.000',
  'ops': '.000',
  'caughtStealing': 0,
  'stolenBases': 0,
  'stolenBasePercentage': '.---',
  'groundIntoDoublePlay': 0,
  'groundIntoTriplePlay': 0,
  'numberOfPitches': 0,
  'plateAppearances': 0,
  'totalBases': 0,
  'rbi': 0,
  'leftOnBase': 0,
  'sacBunts': 0,
  'sacFlies': 0,
  'babip': '.---',
  'groundOutsToAirouts': '-.--',
  'catchersInterference': 0,
  'atBatsPerHomeRun': '-.--'},
 'pitching': {'gamesPlayed': 13,
  'gamesStarted': 0,
  'groundOuts': 12,
  'airOuts': 7,
  'runs': 5,
  'doubles': 0,
  'triples': 1,
  'homeRuns': 1,
  'strikeOuts': 11,
  'baseOnBalls': 3,
  'intentionalWalks': 0,
  'hits': 12,
  'hitByPitch': 0,
  'avg': '.293',
  'atBats': 41,
  'obp': '.333',
  'slg': '.41

#### Various tests and examples trying to figure out how to get the above function working

In [19]:
#Checking length of fielding_stats
fielding_stats = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})
len(fielding_stats['people'][0]['stats'][0]['splits'])

8

In [20]:
#Verifying location of position code for looping
statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][0]['stat']['position']['code']

'4'

In [21]:
#Verifying position of one player's single position fielding stats
statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[hitting,fielding,pitching],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][1]['splits'][0]['stat']

{'gamesPlayed': 103,
 'groundOuts': 139,
 'airOuts': 83,
 'runs': 71,
 'doubles': 18,
 'triples': 0,
 'homeRuns': 12,
 'strikeOuts': 56,
 'baseOnBalls': 61,
 'intentionalWalks': 0,
 'hits': 109,
 'hitByPitch': 4,
 'avg': '.282',
 'atBats': 386,
 'obp': '.385',
 'slg': '.422',
 'ops': '.807',
 'caughtStealing': 3,
 'stolenBases': 4,
 'stolenBasePercentage': '.571',
 'groundIntoDoublePlay': 8,
 'groundIntoTriplePlay': 0,
 'numberOfPitches': 1834,
 'plateAppearances': 452,
 'totalBases': 163,
 'rbi': 43,
 'leftOnBase': 127,
 'sacBunts': 0,
 'sacFlies': 1,
 'babip': '.304',
 'groundOutsToAirouts': '1.67',
 'catchersInterference': 0,
 'atBatsPerHomeRun': '32.17'}

In [22]:
#Looking at stats for Javier Baez who switched teams mid season.  Need to use abbreviation == All to get his stats, and combine them.
statsapi.get("people", {"personIds": 595879, "hydrate": "stats(group=[hitting,pitching,fielding],type=[byDateRange],\
    startDate=2021-04-05,endDate=2021-10-01,season=2021)"})

{'copyright': 'Copyright 2022 MLB Advanced Media, L.P.  Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt',
 'people': [{'id': 595879,
   'fullName': 'Javier Baez',
   'link': '/api/v1/people/595879',
   'firstName': 'Ednel',
   'lastName': 'Baez',
   'primaryNumber': '28',
   'birthDate': '1992-12-01',
   'currentAge': 29,
   'birthCity': 'Bayamon',
   'birthCountry': 'Puerto Rico',
   'height': '6\' 0"',
   'weight': 190,
   'active': True,
   'primaryPosition': {'code': '6',
    'name': 'Shortstop',
    'type': 'Infielder',
    'abbreviation': 'SS'},
   'useName': 'Javier',
   'middleName': 'Javier',
   'boxscoreName': 'Báez, J',
   'nickName': 'El Mago',
   'gender': 'M',
   'isPlayer': True,
   'isVerified': True,
   'draftYear': 2011,
   'stats': [{'type': {'displayName': 'byDateRange'},
     'group': {'displayName': 'fielding'},
     'exemptions': [],
     'splits': [{'stat': {'gamesPlayed': 88,
        'g

In [23]:
#Testing for Javier Baez - Only returning first entry of hitting 
player_stats(595879, '2021-04-05', '2021-10-01', 2021)

{'hitting': {'gamesPlayed': 91,
  'groundOuts': 71,
  'airOuts': 53,
  'runs': 48,
  'doubles': 9,
  'triples': 2,
  'homeRuns': 22,
  'strikeOuts': 131,
  'baseOnBalls': 15,
  'intentionalWalks': 1,
  'hits': 83,
  'hitByPitch': 7,
  'avg': '.248',
  'atBats': 335,
  'obp': '.292',
  'slg': '.484',
  'ops': '.776',
  'caughtStealing': 3,
  'stolenBases': 13,
  'stolenBasePercentage': '.813',
  'groundIntoDoublePlay': 7,
  'groundIntoTriplePlay': 0,
  'numberOfPitches': 1365,
  'plateAppearances': 361,
  'totalBases': 162,
  'rbi': 65,
  'leftOnBase': 162,
  'sacBunts': 0,
  'sacFlies': 3,
  'babip': '.330',
  'groundOutsToAirouts': '1.34',
  'catchersInterference': 1,
  'atBatsPerHomeRun': '15.23'},
 'fielding': {'gamesPlayed': 123,
  'gamesStarted': 118,
  'assists': 304,
  'putOuts': 192,
  'errors': 22,
  'chances': 518,
  'fielding': '.954.968',
  'position': {'code': '4',
   'name': 'Second Base',
   'type': 'Infielder',
   'abbreviation': '2B'},
  'rangeFactorPerGame': '3.49',
 

## Trying to add dictionaries together to get total stats

In [24]:
#Testing to see if I can combine fielding stats into a dict
#It works except for the 'position' dictionary, which I do not need the info from
#Also combines fielding as a string, but I can fix that later, if I decide to use that stat (thinking I will not for now)
d1 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][0]['stat']

d2 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][1]['stat']

d3 = dict(d1) # don't do `d3=d1`, you need to make a copy

d3.update(d2) 
try:
    for i, j in d1.items():

        for x, y in d2.items():

            if i == x:

                d3[i]=(j+y)
except: pass
print(d3)

{'gamesPlayed': 62, 'gamesStarted': 52, 'assists': 91, 'putOuts': 213, 'errors': 1, 'chances': 305, 'fielding': '1.000.995', 'position': {'code': '3', 'name': 'First Base', 'type': 'Infielder', 'abbreviation': '1B'}, 'rangeFactorPerGame': '6.81', 'rangeFactorPer9Inn': '8.20', 'innings': '202.0', 'games': 27, 'doublePlays': 10, 'triplePlays': 0, 'throwingErrors': 0}


In [25]:
#Testing for three dictionaries
d1 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][0]['stat']

d2 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][1]['stat']

d0 = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})['people'][0]['stats'][0]['splits'][2]['stat']

d3 = dict(d1) # don't do `d3=d1`, you need to make a copy

d3.update(d2) 
try:
    for i, j in d1.items():

        for x, y in d2.items():

                if (i == x):

                    d3[i]=(j+y)
except: pass
d4 = dict(d3)
d4.update(d0)
try:
    for i, j in d3.items():

        for x, y in d0.items():

                if (i == x):

                    d4[i]=(j+y)
except: pass
print(d4)

{'gamesPlayed': 66, 'gamesStarted': 56, 'assists': 91, 'putOuts': 213, 'errors': 1, 'chances': 305, 'fielding': '1.000.995.000', 'position': {'code': '10', 'name': 'Designated Hitter', 'type': 'Hitter', 'abbreviation': 'DH'}, 'rangeFactorPerGame': '0.00', 'rangeFactorPer9Inn': '-.--', 'innings': '0.0', 'games': 4, 'doublePlays': 0, 'triplePlays': 0, 'throwingErrors': 0}


In [26]:
#Making a function to shorten the code - will use this in the stats function to get all the relevant fielding stats added together
def add_dicts(d1, d2):
    d3 = dict(d1)
    d3.update(d2)
    try:
        for i, j in d1.items():

            for x, y in d2.items():

                if (i == x):

                    d3[i]=(j+y)
    except: pass
    return d3

In [27]:
#Testing add_dicts starting with an empty dictionary
empty_dict = {}
dict_list = [d1,d0,d2]
for i in range(len(dict_list)):
    empty_dict = add_dicts(empty_dict,dict_list[i])
empty_dict

{'gamesPlayed': 66,
 'gamesStarted': 56,
 'assists': 91,
 'putOuts': 213,
 'errors': 1,
 'chances': 305,
 'fielding': '1.000.000.995',
 'position': {'code': '3',
  'name': 'First Base',
  'type': 'Infielder',
  'abbreviation': '1B'},
 'rangeFactorPerGame': '6.81',
 'rangeFactorPer9Inn': '8.20',
 'innings': '202.0',
 'games': 27,
 'doublePlays': 10,
 'triplePlays': 0,
 'throwingErrors': 0}

In [28]:
#Getting the if statement to work inside the for loop of the player_stats function and 
#creating a list of unique fielding stats for a player (they duplicated for some reason)
temp_list = []
temp_list2 = []
fielding_stats = statsapi.get("people", {"personIds": 518934, "hydrate": "stats(group=[fielding],type=[byDateRange],\
    startDate=2022-04-05,endDate=2022-08-12,season=2022)"})
for i in range(len(fielding_stats['people'][0]['stats'][0]['splits'])):
    if fielding_stats['people'][0]['stats'][0]['splits'][i]['stat']['position']['code'] in temp_list:
        pass
    else:
        temp_list.append(fielding_stats['people'][0]['stats'][0]['splits'][i]['stat']['position']['code'])
        temp_list2.append(fielding_stats['people'][0]['stats'][0]['splits'][i]['stat']) 
temp_list2

[{'gamesPlayed': 35,
  'gamesStarted': 29,
  'assists': 79,
  'putOuts': 41,
  'errors': 0,
  'chances': 120,
  'fielding': '1.000',
  'position': {'code': '4',
   'name': 'Second Base',
   'type': 'Infielder',
   'abbreviation': '2B'},
  'rangeFactorPerGame': '3.43',
  'rangeFactorPer9Inn': '4.17',
  'innings': '259.0',
  'games': 35,
  'doublePlays': 19,
  'triplePlays': 0,
  'throwingErrors': 0},
 {'gamesPlayed': 27,
  'gamesStarted': 23,
  'assists': 12,
  'putOuts': 172,
  'errors': 1,
  'chances': 185,
  'fielding': '.995',
  'position': {'code': '3',
   'name': 'First Base',
   'type': 'Infielder',
   'abbreviation': '1B'},
  'rangeFactorPerGame': '6.81',
  'rangeFactorPer9Inn': '8.20',
  'innings': '202.0',
  'games': 27,
  'doublePlays': 10,
  'triplePlays': 0,
  'throwingErrors': 0},
 {'gamesPlayed': 4,
  'gamesStarted': 4,
  'assists': 0,
  'putOuts': 0,
  'errors': 0,
  'chances': 0,
  'fielding': '.000',
  'position': {'code': '10',
   'name': 'Designated Hitter',
   'type

In [29]:
#Getting add_dicts function to work to combine all fielding stats into one dictionary - strings will be messed up
dict_A = {}
for i in range(len(temp_list2)):
    dict_A =  add_dicts(dict_A, temp_list2[i])
dict_A

{'gamesPlayed': 109,
 'gamesStarted': 95,
 'assists': 191,
 'putOuts': 234,
 'errors': 2,
 'chances': 427,
 'fielding': '1.000.995.000.992',
 'position': {'code': '5',
  'name': 'Third Base',
  'type': 'Infielder',
  'abbreviation': '3B'},
 'rangeFactorPerGame': '2.81',
 'rangeFactorPer9Inn': '3.09',
 'innings': '352.2',
 'games': 43,
 'doublePlays': 7,
 'triplePlays': 0,
 'throwingErrors': 1}

## Pulling some stats for a team roster/matchup
###### Need to get the date of the game for the player_stats function to work properly - this date will be the end date, with the start of season offset date as the start

In [30]:
#reading json
f = open('roster_dict')
data = json.load(f)

In [31]:
#Looking at the first matchup 

#Getting relevant data from dictionary
matchup_one = data[0]
game_id = matchup_one['game_id']
home_roster = matchup_one['home_roster']
away_roster = matchup_one['away_roster']
date = statsapi.boxscore_data(game_id, timecode=None)['gameId'][0:10].replace('/', '-')

#Pulling stats for each player - had to look up date manually for now

for i in home_roster:
    print(player_stats(i,'2021-04-01', date, 2021))

{'hitting': {'gamesPlayed': 29, 'groundOuts': 41, 'airOuts': 22, 'runs': 19, 'doubles': 5, 'triples': 0, 'homeRuns': 1, 'strikeOuts': 22, 'baseOnBalls': 16, 'intentionalWalks': 0, 'hits': 31, 'hitByPitch': 1, 'avg': '.267', 'atBats': 116, 'obp': '.361', 'slg': '.336', 'ops': '.697', 'caughtStealing': 0, 'stolenBases': 1, 'stolenBasePercentage': '1.000', 'groundIntoDoublePlay': 4, 'groundIntoTriplePlay': 0, 'numberOfPitches': 537, 'plateAppearances': 133, 'totalBases': 39, 'rbi': 7, 'leftOnBase': 44, 'sacBunts': 0, 'sacFlies': 0, 'babip': '.323', 'groundOutsToAirouts': '1.86', 'catchersInterference': 0, 'atBatsPerHomeRun': '116.00'}, 'fielding': {'gamesPlayed': 37, 'gamesStarted': 29, 'assists': 39, 'putOuts': 119, 'errors': 1, 'chances': 159, 'fielding': '1.0001.000.944', 'position': {'code': '5', 'name': 'Third Base', 'type': 'Infielder', 'abbreviation': '3B'}, 'rangeFactorPerGame': '2.83', 'rangeFactorPer9Inn': '3.73', 'innings': '41.0', 'games': 6, 'doublePlays': 1, 'triplePlays': 0

### Troubleshooting again for player_stats, with pitching specifically
###### Was running into issues with pitchers being on the roster but not pitching for the team yet in the season

In [32]:
#Looking at how to get the total stats for a pitcher in the season so far (As they are often traded and a separate entry is created for each team)
pitching_stats = statsapi.get("people", {"personIds": 593974, "hydrate": f"stats(group=[pitching],type=[byDateRange],\
    startDate=2021-04-01,endDate=2021-05-06,season=2021)"})
pitching_stats['people'][0]['stats'][0]['splits'][-1]['stat']

{'gamesPlayed': 13,
 'gamesStarted': 0,
 'groundOuts': 12,
 'airOuts': 7,
 'runs': 5,
 'doubles': 0,
 'triples': 1,
 'homeRuns': 1,
 'strikeOuts': 11,
 'baseOnBalls': 3,
 'intentionalWalks': 0,
 'hits': 12,
 'hitByPitch': 0,
 'avg': '.293',
 'atBats': 41,
 'obp': '.333',
 'slg': '.415',
 'ops': '.748',
 'caughtStealing': 0,
 'stolenBases': 0,
 'stolenBasePercentage': '.---',
 'groundIntoDoublePlay': 2,
 'numberOfPitches': 171,
 'era': '4.22',
 'inningsPitched': '10.2',
 'wins': 2,
 'losses': 1,
 'saves': 2,
 'saveOpportunities': 3,
 'holds': 0,
 'blownSaves': 1,
 'earnedRuns': 5,
 'whip': '1.41',
 'battersFaced': 45,
 'outs': 32,
 'gamesPitched': 13,
 'completeGames': 0,
 'shutouts': 0,
 'strikes': 115,
 'strikePercentage': '.670',
 'hitBatsmen': 0,
 'balks': 1,
 'wildPitches': 2,
 'pickoffs': 0,
 'totalBases': 17,
 'groundOutsToAirouts': '1.71',
 'winPercentage': '.667',
 'pitchesPerInning': '16.03',
 'gamesFinished': 6,
 'strikeoutWalkRatio': '3.67',
 'strikeoutsPer9Inn': '9.28',
 'w

In [33]:
#Looking for Gerrit Cole's stats with function - was returning {} before I fixed the function now
player_stats(543037,'2021-04-01', '2021-05-06', 2021)

{'pitching': {'gamesPlayed': 7,
  'gamesStarted': 7,
  'groundOuts': 29,
  'airOuts': 36,
  'runs': 9,
  'doubles': 3,
  'triples': 1,
  'homeRuns': 3,
  'strikeOuts': 66,
  'baseOnBalls': 3,
  'intentionalWalks': 0,
  'hits': 29,
  'hitByPitch': 1,
  'avg': '.182',
  'atBats': 159,
  'obp': '.201',
  'slg': '.270',
  'ops': '.471',
  'caughtStealing': 0,
  'stolenBases': 2,
  'stolenBasePercentage': '1.000',
  'groundIntoDoublePlay': 3,
  'numberOfPitches': 696,
  'era': '1.61',
  'inningsPitched': '44.2',
  'wins': 4,
  'losses': 1,
  'saves': 0,
  'saveOpportunities': 0,
  'holds': 0,
  'blownSaves': 0,
  'earnedRuns': 8,
  'whip': '0.72',
  'battersFaced': 164,
  'outs': 134,
  'gamesPitched': 7,
  'completeGames': 0,
  'shutouts': 0,
  'strikes': 479,
  'strikePercentage': '.690',
  'hitBatsmen': 1,
  'balks': 0,
  'wildPitches': 2,
  'pickoffs': 0,
  'totalBases': 43,
  'groundOutsToAirouts': '0.81',
  'winPercentage': '.800',
  'pitchesPerInning': '15.58',
  'gamesFinished': 0,


In [34]:
#Looking for Gerrit Cole's stats straight from the API - works fine
statsapi.get("people", {"personIds": 543037, "hydrate": f"stats(group=[pitching],type=[byDateRange],\
    startDate=2021-04-01,endDate=2021-05-06,season=2021)"})['people'][0]['stats'][0]['splits'][-1]['stat']

{'gamesPlayed': 7,
 'gamesStarted': 7,
 'groundOuts': 29,
 'airOuts': 36,
 'runs': 9,
 'doubles': 3,
 'triples': 1,
 'homeRuns': 3,
 'strikeOuts': 66,
 'baseOnBalls': 3,
 'intentionalWalks': 0,
 'hits': 29,
 'hitByPitch': 1,
 'avg': '.182',
 'atBats': 159,
 'obp': '.201',
 'slg': '.270',
 'ops': '.471',
 'caughtStealing': 0,
 'stolenBases': 2,
 'stolenBasePercentage': '1.000',
 'groundIntoDoublePlay': 3,
 'numberOfPitches': 696,
 'era': '1.61',
 'inningsPitched': '44.2',
 'wins': 4,
 'losses': 1,
 'saves': 0,
 'saveOpportunities': 0,
 'holds': 0,
 'blownSaves': 0,
 'earnedRuns': 8,
 'whip': '0.72',
 'battersFaced': 164,
 'outs': 134,
 'gamesPitched': 7,
 'completeGames': 0,
 'shutouts': 0,
 'strikes': 479,
 'strikePercentage': '.690',
 'hitBatsmen': 1,
 'balks': 0,
 'wildPitches': 2,
 'pickoffs': 0,
 'totalBases': 43,
 'groundOutsToAirouts': '0.81',
 'winPercentage': '.800',
 'pitchesPerInning': '15.58',
 'gamesFinished': 0,
 'strikeoutWalkRatio': '22.00',
 'strikeoutsPer9Inn': '13.30'

In [35]:
#Testing for pitcher who swapped teams mid season
#Verifies that we only need the last entry for the cumulative pitching stats
statsapi.get("people", {"personIds": 458677, "hydrate": f"stats(group=[pitching],type=[byDateRange],\
    startDate=2021-04-01,endDate=2021-10-01,season=2021)"})

{'copyright': 'Copyright 2022 MLB Advanced Media, L.P.  Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt',
 'people': [{'id': 458677,
   'fullName': 'Justin Wilson',
   'link': '/api/v1/people/458677',
   'firstName': 'Justin',
   'lastName': 'Wilson',
   'primaryNumber': '34',
   'birthDate': '1987-08-18',
   'currentAge': 34,
   'birthCity': 'Anaheim',
   'birthStateProvince': 'CA',
   'birthCountry': 'USA',
   'height': '6\' 2"',
   'weight': 205,
   'active': True,
   'primaryPosition': {'code': '1',
    'name': 'Pitcher',
    'type': 'Pitcher',
    'abbreviation': 'P'},
   'useName': 'Justin',
   'middleName': 'James',
   'boxscoreName': 'Wilson, J',
   'nickName': 'J Willy',
   'gender': 'M',
   'isPlayer': True,
   'isVerified': True,
   'draftYear': 2008,
   'stats': [{'type': {'displayName': 'byDateRange'},
     'group': {'displayName': 'pitching'},
     'exemptions': [],
     'splits': [{'stat': {'game

## Player Stats Function Final Vers

In [36]:
#Fixing function to get player stats
#Inputs are the id, start date and end date

#Fix using abbreviation == All

def player_stats(id,start_date, end_date, season):

    #Getting player stats - separately because the API switches their order when fetched at the same time 
    hitting_stats = statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[hitting],type=[byDateRange],\
    startDate={start_date},endDate={end_date},season={season})"})
    pitching_stats = statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[pitching],type=[byDateRange],\
    startDate={start_date},endDate={end_date},season={season})"})
    fielding_stats = statsapi.get("people", {"personIds": id, "hydrate": f"stats(group=[fielding],type=[byDateRange],\
    startDate={start_date},endDate={end_date},season={season})"})

    #dict where I will add all the collected stats
    stat_dict = {}

    #Temporary storage for fielding dicts for all positions
    fielding_list = []

    #List to check if the stats for a fielding position have already been collected - as they are listed more than once
    pos_codes = []

    #Taking last entry of hitting stats, as that is the cumulative for the date range
    try:
        player_hitting_stats = hitting_stats['people'][0]['stats'][0]['splits'][-1]['stat']
        stat_dict['hitting'] = player_hitting_stats
    except: pass
    
    #Taking last entry of pitching stats, as that is the cumulative for the date range
    try:
        player_pitching_stats = pitching_stats['people'][0]['stats'][0]['splits'][-1]['stat']
        stat_dict['pitching'] = player_pitching_stats
    except: pass

    #Summing cumulative fielding stats for each position played (Team changes now accounted for)
    try:
        for i in range(len(fielding_stats['people'][0]['stats'][0]['splits'])):
            if fielding_stats['people'][0]['stats'][0]['splits'][i]['sport']['abbreviation'] == 'All':
                fielding_list.append(fielding_stats['people'][0]['stats'][0]['splits'][i]['stat']) 
                
        all_fielding = {}

        for i in range(len(fielding_list)):
            all_fielding = add_dicts(all_fielding, fielding_list[i])
        stat_dict['fielding'] = all_fielding
        
        
    except:pass
    return stat_dict

In [37]:
#Testing for Javier Baez - Now works properly for hitting and fielding
player_stats(595879, '2021-04-05', '2021-10-01', 2021)

{'hitting': {'gamesPlayed': 138,
  'groundOuts': 105,
  'airOuts': 83,
  'runs': 80,
  'doubles': 18,
  'triples': 2,
  'homeRuns': 31,
  'strikeOuts': 184,
  'baseOnBalls': 28,
  'intentionalWalks': 2,
  'hits': 133,
  'hitByPitch': 13,
  'avg': '.265',
  'atBats': 502,
  'obp': '.319',
  'slg': '.494',
  'ops': '.813',
  'caughtStealing': 5,
  'stolenBases': 18,
  'stolenBasePercentage': '.783',
  'groundIntoDoublePlay': 12,
  'groundIntoTriplePlay': 0,
  'numberOfPitches': 2065,
  'plateAppearances': 547,
  'totalBases': 248,
  'rbi': 87,
  'leftOnBase': 235,
  'sacBunts': 0,
  'sacFlies': 3,
  'babip': '.352',
  'groundOutsToAirouts': '1.27',
  'catchersInterference': 1,
  'atBatsPerHomeRun': '16.19'},
 'fielding': {'gamesPlayed': 135,
  'gamesStarted': 130,
  'assists': 327,
  'putOuts': 210,
  'errors': 24,
  'chances': 561,
  'fielding': '.954.968',
  'position': {'code': '4',
   'name': 'Second Base',
   'type': 'Infielder',
   'abbreviation': '2B'},
  'rangeFactorPerGame': '3.

## Pulling Stats from roster_date_dict for a team matchup

In [38]:
#reading json
f = open('roster_date_dict')
roster_matchup = json.load(f)
roster_matchup

[{'game_id': 634224,
  'date': '2021-05-06',
  'home_roster': [518934,
   519317,
   592450,
   642180,
   650402,
   645801,
   543305,
   543309,
   458731,
   656061,
   570666,
   547973,
   593334,
   650633,
   446372,
   642528,
   656756,
   593974,
   592791],
  'away_roster': [514888,
   488726,
   608324,
   670541,
   493329,
   621043,
   663656,
   676801,
   455117,
   650556,
   656232,
   592288,
   677651,
   425844,
   664299,
   548384,
   501925,
   592773,
   664353]},
 {'game_id': 634283,
  'date': '2021-05-06',
  'home_roster': [457705,
   666160,
   656555,
   544369,
   664761,
   595284,
   665155,
   546318,
   554430,
   621237,
   502624,
   641401,
   656322,
   660853,
   621107,
   571735,
   445213,
   519043,
   593576,
   605400,
   624133,
   592826],
  'away_roster': [543939,
   596129,
   621438,
   543768,
   598265,
   649966,
   641856,
   456715,
   541645,
   676424,
   502202,
   656420,
   623352,
   605288,
   641778,
   642547,
   656876,

In [39]:
#Looking at the first matchup 

#Getting relevant data from dictionary
first_game = roster_matchup[0]
date = first_game['date']
game_id = first_game['game_id']
home_roster = matchup_one['home_roster']
away_roster = matchup_one['away_roster']

#Pulling stats for each player - using season_start_end function to fetch the season start date

for i in home_roster:
    print(player_stats(i,season_start_end(2021)[0], date, 2021))

{'hitting': {'gamesPlayed': 29, 'groundOuts': 41, 'airOuts': 22, 'runs': 19, 'doubles': 5, 'triples': 0, 'homeRuns': 1, 'strikeOuts': 22, 'baseOnBalls': 16, 'intentionalWalks': 0, 'hits': 31, 'hitByPitch': 1, 'avg': '.267', 'atBats': 116, 'obp': '.361', 'slg': '.336', 'ops': '.697', 'caughtStealing': 0, 'stolenBases': 1, 'stolenBasePercentage': '1.000', 'groundIntoDoublePlay': 4, 'groundIntoTriplePlay': 0, 'numberOfPitches': 537, 'plateAppearances': 133, 'totalBases': 39, 'rbi': 7, 'leftOnBase': 44, 'sacBunts': 0, 'sacFlies': 0, 'babip': '.323', 'groundOutsToAirouts': '1.86', 'catchersInterference': 0, 'atBatsPerHomeRun': '116.00'}, 'fielding': {'gamesPlayed': 37, 'gamesStarted': 29, 'assists': 39, 'putOuts': 119, 'errors': 1, 'chances': 159, 'fielding': '.9441.0001.000', 'position': {'code': '3', 'name': 'First Base', 'type': 'Infielder', 'abbreviation': '1B'}, 'rangeFactorPerGame': '5.39', 'rangeFactorPer9Inn': '7.73', 'innings': '113.0', 'games': 18, 'doublePlays': 8, 'triplePlays':

In [40]:
#Comparing some results of the function to a proper get request from the API
#Results match my function
roster_list = [518934,
   519317,
   592450,
   642180,
   650402,
   645801,
   543305,
   543309,
   458731,
   656061,
   570666,
   547973,
   593334,
   650633,
   446372,
   642528,
   656756,
   593974,
   592791]
for i in roster_list:
    print(statsapi.get("people", {"personIds": i, "hydrate": f"stats(group=[pitching,fielding,hitting],type=[byDateRange],\
    startDate=2021-04-01,endDate=2021-05-06,season=2021)"})['people'][0]['stats'])

[{'type': {'displayName': 'byDateRange'}, 'group': {'displayName': 'fielding'}, 'exemptions': [], 'splits': [{'stat': {'gamesPlayed': 13, 'gamesStarted': 12, 'assists': 25, 'putOuts': 19, 'errors': 0, 'chances': 44, 'fielding': '1.000', 'position': {'code': '4', 'name': 'Second Base', 'type': 'Infielder', 'abbreviation': '2B'}, 'rangeFactorPerGame': '3.38', 'rangeFactorPer9Inn': '3.84', 'innings': '103.2', 'games': 13, 'doublePlays': 8, 'triplePlays': 0, 'throwingErrors': 0}, 'team': {'id': 147, 'name': 'New York Yankees', 'link': '/api/v1/teams/147'}, 'sport': {'id': 1, 'link': '/api/v1/sports/1', 'abbreviation': 'MLB'}, 'numTeams': 1}, {'stat': {'gamesPlayed': 18, 'gamesStarted': 12, 'assists': 3, 'putOuts': 94, 'errors': 0, 'chances': 97, 'fielding': '1.000', 'position': {'code': '3', 'name': 'First Base', 'type': 'Infielder', 'abbreviation': '1B'}, 'rangeFactorPerGame': '5.39', 'rangeFactorPer9Inn': '7.73', 'innings': '113.0', 'games': 18, 'doublePlays': 8, 'triplePlays': 0, 'throw

## Pulling more specific stats from roster_date_dict for a player
###### Starting with only a few stats

In [41]:
#Pulling stats for one player and adding to a df for future use

#Stats we will be using for now
hitting_stats = ['runs', 'rbi', 'homeRuns', 'hits', 'avg', 'ops', 'groundIntoDoublePlay']

#Initializing empty df with specified column names
df = pd.DataFrame(columns = hitting_stats)

#Storing all stats as a list
stat_list = [player_stats(595879, '2021-04-05', '2021-10-01', 2021)['hitting'][i] for i in hitting_stats]

#appending stats to the df
df.loc[len(df)] = stat_list


In [42]:
#Testing looping through one roster - Need to be more efficient here, takes 20 seconds to execute

#Stats we will be using for now
hitting_stats = ['runs', 'rbi', 'homeRuns', 'hits', 'avg', 'ops', 'groundIntoDoublePlay']

#Initializing empty df with specified column names
df = pd.DataFrame(columns = hitting_stats)

player_list = [595879,518934]

for j in roster_list:

    #Storing all stats as a list - using roster_list from above
    stat_list = [player_stats(j, '2021-04-05', '2021-10-01', 2021)['hitting'][i] for i in hitting_stats]

    #appending stats to the df
    df.loc[len(df)] = stat_list

In [66]:
#Testing looping through a few players - Need to be more efficient here, takes about the same time as above to execute

#Stats we will be using for now
hitting_stats = ['runs', 'rbi', 'homeRuns', 'hits', 'avg', 'ops', 'groundIntoDoublePlay']

#Initializing empty df with specified column names
df = pd.DataFrame(columns = hitting_stats)


#Storing all stats as a list - using roster_list from above
stat_list = [[player_stats(j, '2021-04-05', '2021-10-01', 2021)['hitting'][i] for i in hitting_stats] for j in roster_list]

#appending stats to the df
for i in range(len(stat_list)):
    df.loc[len(df)] = stat_list[i]

## Testing putting data directly into a df - Test this for a team roster later

In [112]:
#Creating empty df from keys in the dicts of each data group - using players to fill

df = pd.DataFrame(columns = player_stats(518934, '2021-04-05', '2021-10-01', 2021)['hitting'].keys())
df2 = pd.DataFrame(columns = player_stats(518934, '2021-04-05', '2021-10-01', 2021)['fielding'].keys())
df3 = pd.DataFrame(columns = player_stats(656061, '2021-04-05', '2021-10-01', 2021)['pitching'].keys())

In [113]:
df.loc[len(df)] = player_stats(518934, '2021-04-05', '2021-10-01', 2021)['hitting']

In [114]:
df2.loc[len(df2)] = player_stats(518934, '2021-04-05', '2021-10-01', 2021)['fielding']

In [None]:
#Combining all the dfs
pd.concat([df,df2,df3],axis=1)