In [1]:
import pandas as pd
import string
import os

import statsapi

In [2]:
# Lookup tables used by the cells below: player_search supplies the
# key_retro / key_mlbam columns, team_search the statsapi_id / file_code columns.
# Paths are built with os.path.join so the notebook also runs on macOS/Linux,
# where a backslash is part of the file name rather than a separator.
player_search = pd.read_csv(os.path.join("data", "player_search.csv"), encoding = "ISO-8859-1")
team_search = pd.read_csv(os.path.join("data", "team_search.csv"))

In [None]:
# Get all plate appearances for 2015-2021 seasons.
#
# Each file under data/play-by-play/<year> is read as comma-separated records.
# "start" and "sub" records whose last field is "1" update the current pitcher
# for the away side (field 3 == "0") or the home side; every "play" record is
# classified by _classify_outcome and, when that yields a usable outcome, is
# stored as one row. One CSV per season is written to
# data/matchups/<year>_matchups.csv.
years = [2015,2016,2017,2018,2019,2020,2021]

# Map key_retro -> key_mlbam once instead of running DataFrame.query() for every
# play (a full scan of player_search per record). This also avoids calling
# int() on a Series, which pandas has deprecated.
# NOTE(review): if key_retro contains duplicates the last row wins here, whereas
# the previous per-play query raised for such ids.
retro_to_mlbam = dict(zip(player_search['key_retro'], player_search['key_mlbam']))


def _mlbam_id(retro_id):
    """Return the integer key_mlbam that player_search lists for a key_retro id.

    Raises:
        KeyError: if retro_id does not appear in player_search['key_retro'].
    """
    try:
        return int(retro_to_mlbam[retro_id])
    except KeyError:
        raise KeyError(f"no key_mlbam found for key_retro {retro_id!r}") from None


def _classify_outcome(pitches, event):
    """Turn the pitch-sequence and event fields of a "play" record into a label.

    Args:
        pitches: field 5 of the play record (pitch sequence string).
        event: field 6 of the play record (event description string).

    Returns:
        "p_<code>" for balks, intentional walks, HBP, strikeouts and walks,
        "h_<code>" when the pitch sequence ends in "X" (ball put in play),
        or "" when the event matches neither case.
    """
    outcome = ""

    # Handle balks, intentional walks, HBP, K, and BB.
    prefix = event[:2]
    if prefix in {"BK", "IW", "HP"}:
        outcome = "p_" + prefix
    elif prefix != "WP" and event[:1] in {"K", "I", "W"}:
        # "WP" (wild pitch) shares its first letter with "W" (walk); without the
        # explicit exclusion every wild pitch was recorded as "p_W".
        outcome = "p_" + event[0]

    # Get pitch outcome if resulted in contact.
    if pitches.endswith("X"):
        play_parts = event.split("/")
        # Drop everything after the first "." from the leading event segment.
        play = play_parts[0].split(".")[0]

        if play[0] == "H":
            play = "HR"
        elif play[0] in string.digits:
            play = play[0]
        elif play[0] in {"S", "D", "T"}:
            play = play[:2]
            # Try to get first ball handler
            if len(play) < 2:
                try:
                    handlers = play_parts[1]
                    # NOTE(review): this is a substring test against
                    # "0123456789" (so "L7" fails, "89" passes, and "" passes
                    # and then falls into the IndexError branch). Kept as-is to
                    # preserve existing output; confirm whether
                    # handlers[0] in string.digits was intended.
                    if handlers in string.digits:
                        play = play[0] + handlers[0]
                except IndexError:
                    play = play[0] + "X"
        elif play[:2] == "FC":
            # Some data doesn't list fielder
            if len(play) > 2:
                play = play[2]
            else:
                # Handle sacrifice bunts
                if play_parts[1] == 'SH':
                    play = play_parts[2][2]
                else:
                    play = play_parts[1][1]

        outcome = "h_" + play

    return outcome


for year in years:
    matchup_rows = []

    # Sort by file name so the row order of the output does not depend on the
    # order in which the operating system happens to list the directory.
    season_dir = os.path.join("data", "play-by-play", str(year))
    with os.scandir(season_dir) as entries:
        event_files = sorted((entry for entry in entries if entry.is_file()),
                             key=lambda entry: entry.name)

    for event_file in event_files:
        # Reset per file so a file without "start" records fails loudly instead
        # of silently reusing pitchers from the previous file.
        away_pitcher = None
        home_pitcher = None

        # The context manager closes the file even if a record raises.
        # Iterating the file ends cleanly at EOF; the former nested readline()
        # loop could spin forever when an "id" record was not followed by a
        # "play" record. Blank lines are now skipped rather than treated as EOF.
        with open(event_file.path) as events:
            for raw_line in events:
                parts = raw_line.strip().split(",")
                record = parts[0]

                # Starting pitchers ("start") and pitcher changes ("sub").
                if record in {"start", "sub"}:
                    if parts[-1] == "1":
                        if parts[3] == "0":
                            away_pitcher = parts[1]
                        else:
                            home_pitcher = parts[1]

                # Get matchups data
                elif record == "play":
                    outcome = _classify_outcome(parts[5], parts[6])

                    # Ignore unclassified plays, catcher interference and
                    # ambiguous singles.
                    if outcome == "" or outcome in {"h_C", "h_S"}:
                        continue

                    # Get home/away and player ID's
                    batter = _mlbam_id(parts[3])
                    if parts[2] == "1":
                        batter_home, pitcher_home = 1, 0
                        pitcher = _mlbam_id(away_pitcher)
                    else:
                        batter_home, pitcher_home = 0, 1
                        pitcher = _mlbam_id(home_pitcher)

                    matchup_rows.append([parts[1], batter, batter_home, pitcher, pitcher_home, outcome])

    matchup_data = pd.DataFrame(matchup_rows, columns=['inning','batter_id','batter_home','pitcher_id','pitcher_home','outcome'])
    matchup_data.to_csv(os.path.join("data", "matchups", f"{year}_matchups.csv"))


In [4]:
# Get schedules for each team between 2015-2021.
#
# For every (team, season) pair, download the team's schedule through statsapi,
# keep the games whose game_type is 'R', and write them to
# data/schedules/<file_code>_<year>.csv. Files that already exist are left
# untouched, so the cell can be re-run to fill in only the missing seasons.
years = [2015,2016,2017,2018,2019,2020,2021]
for team_id in team_search['statsapi_id']:
    # The file code depends only on the team, so resolve it once per team.
    file_code = team_search.query(f"statsapi_id == {team_id}")['file_code'].values[0]

    for year in years:
        file_path = os.path.join("data", "schedules", f"{file_code}_{year}.csv")

        # Check the cache BEFORE hitting the API: existing files are never
        # overwritten, so downloading their schedule again is wasted work.
        if os.path.exists(file_path):
            continue

        try:
            schedule = statsapi.schedule(start_date=f"{year}-01-01", end_date=f"{year}-12-31", team=team_id)
        except Exception as exc:
            # Best effort: report the failed request and move on to the next
            # season. Catching Exception (not a bare except) keeps Ctrl-C /
            # kernel interrupts working.
            print(f"{team_id} ({year}): schedule request failed: {exc!r}")
            continue

        # One row per kept game; "home" is 1 when this team is the home side.
        schedule_rows = [
            [game['game_id'], game['game_date'], game['venue_id'],
             1 if game['home_id'] == team_id else 0]
            for game in schedule
            if game['game_type'] == 'R'
        ]

        team_schedule = pd.DataFrame(schedule_rows, columns=['game_id','game_date','venue_id','home'])
        team_schedule.to_csv(file_path)


In [3]:
# Get team boxscores for batting/pitching stats.
# NOTE(review): the game id is hard-coded to a single game.
boxscore = statsapi.boxscore_data(565997)

# Build "<a>-<b>-<c>" from the first three "/"-separated pieces of gameId
# (presumably year, month and day -- confirm against the statsapi payload).
game_id_parts = boxscore['gameId'].split("/")
game_date = "{}-{}-{}".format(game_id_parts[0], game_id_parts[1], game_id_parts[2])


def _batting_line(batter):
    """Convert one batter entry of the boxscore into a row of integer stats.

    The row order matches `columns` below:
    batter_id, PA, AB, H, Singles, Doubles, Triples, HR, RBI, R, BB, K.
    PA is computed as AB + BB and Singles as H - Doubles - Triples - HR.
    NOTE(review): AB + BB leaves out other plate-appearance outcomes (e.g.
    hit-by-pitch, sacrifices); confirm whether that approximation is intended.
    """
    at_bats = int(batter['ab'])
    walks = int(batter['bb'])
    hits = int(batter['h'])
    doubles = int(batter['doubles'])
    triples = int(batter['triples'])
    home_runs = int(batter['hr'])
    return [batter['personId'], at_bats + walks, at_bats, hits,
            hits - doubles - triples - home_runs,
            doubles, triples, home_runs,
            int(batter['rbi']), int(batter['r']), walks, int(batter['k'])]


def _batting_lines(batters):
    """Return one stat row per batter entry, skipping the first entry.

    The first element is skipped exactly as the original counter-based loop did
    (presumably it is a header row rather than a player -- confirm).
    """
    return [_batting_line(batter) for batter in batters[1:]]


away_batters = boxscore['away']['batters']
away_pitchers = boxscore['away']['pitchers']
away_bullpen = boxscore['away']['bullpen']
away_boxscore = _batting_lines(boxscore['awayBatters'])

home_batters = boxscore['home']['batters']
home_pitchers = boxscore['home']['pitchers']
home_bullpen = boxscore['home']['bullpen']
home_boxscore = _batting_lines(boxscore['homeBatters'])

columns = ['batter_id','PA','AB','H','Singles','Doubles','Triples','HR','RBI','R','BB','K']
