In [1]:
import pandas as pd
import numpy as np
import statsapi
import os
from tqdm import tqdm
import pickle

In [2]:
# Retrosheet Player ID's to name
names = {}
with open("names.csv", 'r') as f:
    for line in f:
        line = line.strip().split(',')
        names[line[0]] = f"{line[2]} {line[1]}"
        
        
IDS = {}
with open("ids.csv", 'r') as f:
    
    for line in f:
        line = line.strip().split(',')
        if line[1] not in IDS.keys():
            IDS[line[1]] = line[0]

In [3]:
# Get all data files
TEAMS = ["ANA","ARI","ATL","BAL","BOS","CHA","CHN","CIN","CLE","COL","DET","HOU","KCA","LAN","MIA","MIL","MIN","NYA",\
"NYN","OAK","PHI","PIT","SDN","SEA","SFN","SLN","TBA","TEX","TOR","WAS"]

AMERICAN = ["ANA", "BAL", "BOS", "CHA", "CLE", "DET", "HOU", "KCA", "MIN", "NYA", "OAK", "SEA", "TBA", "TEX", "TOR"]

YEARS = [str(yr) for yr in range(2018, 2020)]


files = []
for y in YEARS:
    for t in TEAMS:
        if t in AMERICAN:
            file = f"Data/{y}{t}.EVA"
        else:
            file = f"Data/{y}{t}.EVN"
            
        if not os.path.exists(file):
            print("404:", file)
        else:
            files.append(file)

In [4]:
# turn data files into array of at-bats 
atBats = []


for FILENAME in tqdm(files):
    f = open(FILENAME, 'r')
    homePitcher = None
    awayPitcher = None
    gameCtr = 0


    for line in f:
        line = line.strip().split(',')

        #newGame
        if line[0] == 'id':
            gameCtr += 1
            date = line[1][7:9] + '/' + line[1][9:11]
            year = line[1][3:7]

        #starting pitcher
        if line[0] == 'start' and line[-1] == '1':
            if line[-3] == '0':
                awayPitcher = line[1]
            elif line[-3] == '1':
                homePitcher = line[1]
                

        #Pitching change
        if line[0] == 'sub' and line[5] == '1':
                
            if line[3] == '1':
                homePitcher = line[1]
            elif line[3] == '0':
                awayPitcher = line[1]

        #Batter
        # if line[2] == 0
        if line[0] == 'play':
            batter = line[3]
            
            if line[2] == "0":
                pitcher = homePitcher
            elif line[2] == "1":
                pitcher = awayPitcher

        #Hit or not
            if line[-1][0] in ["S", 'D', 'T', 'H'] or line[-1][0:2] == 'HR':
                hit = True
            elif line[-1][0] in ["1", "2", "3", "4", "5", "6", "7", "8", "9"]:
                hit = False
            else:
                hit = "skip"

            #print(f"Matchup: {batter} vs {pitcher} -- result: {event}")
            if hit != "skip":
                atBats.append((names[batter], names[pitcher], int(hit), date, year))
                
atBats = np.array(atBats)

100%|██████████| 60/60 [00:01<00:00, 58.94it/s]


In [5]:
def getHittingData(batterID):
    try:
        r = statsapi.player_stat_data(batterID, group="hitting", type='career')['stats'][0]['stats']
        return [r['avg'], r['slg'], r['strikeOuts']]
    except IndexError:
        return None
    
def getPitchingData(pitcherID):
    try:
        r = statsapi.player_stat_data(pitcherID, group="pitching", type='career')['stats'][0]['stats']
        return [r['avg'], r['obp'], r['homeRunsPer9']]
    except IndexError:
        return None

        

In [6]:
statsData = []
target = []
battingStats = {}
pitchingStats = {}

with tqdm(total=len(atBats), position=0, leave=True) as pbar:
    for ab in tqdm(atBats, position=0, leave=True):
        try:
            batterID = IDS[ab[0]]
            pitcherID = IDS[ab[1]]
            outcome = ab[2]

            if batterID not in battingStats.keys():
                battingStats[batterID] = getHittingData(batterID)
            if pitcherID not in pitchingStats.keys():
                pitchingStats[pitcherID] = getPitchingData(pitcherID)


            if battingStats[batterID] is None or pitchingStats[pitcherID] is None:
                continue
            else:
                statsData.append(battingStats[batterID] + pitchingStats[pitcherID])
                target.append(outcome)
          
            pbar.update()

        except KeyError:
            try:
                IDS[ab[0]] = statsapi.lookup_player(ab[0], season=ab[4])[0]['id']
            except IndexError:
                continue
    

100%|██████████| 256571/256571 [31:08<00:00, 137.34it/s] 
 97%|█████████▋| 248584/256571 [31:08<01:00, 133.07it/s]


In [7]:
data = np.array(statsData)
target = np.array(target)

with open("Serialized/data.p", "wb") as f:
    pickle.dump(data, f)
    
with open('Serialized/target.p', 'wb') as f:
    pickle.dump(target, f)