In [None]:
import pandas as pd
import statsapi
import datetime
import string
import json
import os

from bs4 import BeautifulSoup
import requests

In [None]:
# Boolean for advantageous matchups (L vs R)
def adv_matchup(row):
    """Return 1 for an advantageous batter/pitcher hand matchup, else 0.

    A row counts as advantageous when `row['batter_hand']` is 'S', or when
    the batter and pitcher hands are opposite ('R' vs 'L', or 'L' vs 'R').
    `row['pitcher_hand']` is only read for 'R' and 'L' batters.
    """
    # Pitcher hand that makes the matchup advantageous, keyed by batter hand.
    favourable_pitcher = {'R': 'L', 'L': 'R'}

    batter = row['batter_hand']
    if batter == 'S':
        return 1
    if batter in favourable_pitcher and row['pitcher_hand'] == favourable_pitcher[batter]:
        return 1
    return 0

# Add the adv_ab flag to every file in the matchups folder and write each one
# back to the path it was read from. os.path.join keeps the folder reachable on
# hosts whose path separator is not a backslash.
# Note: os.scandir yields entries in arbitrary order, and `matchups` stays bound
# to the last file processed once the loop ends.
for file in os.scandir(os.path.join("data", "matchups")):
    if not file.is_file():
        # Sub-directories cannot be parsed as CSV; skip them instead of crashing.
        continue
    matchups = pd.read_csv(file, index_col=0)
    matchups['adv_ab'] = matchups.apply(adv_matchup, axis=1)
    matchups.to_csv(file.path)


In [None]:
# Change switch hitters to side
def fix_switch(row):
    """Return the batter hand for `row`, resolving 'S' against the pitcher.

    An 'S' batter becomes 'L' when `row['pitcher_hand']` is 'R' and 'R' when
    it is 'L'. Every other batter hand (and 'S' against any other pitcher
    value) is returned unchanged. The row itself is not modified.
    """
    batter = row['batter_hand']
    if batter != 'S':
        return batter

    pitcher = row['pitcher_hand']
    if pitcher == 'R':
        return 'L'
    if pitcher == 'L':
        return 'R'
    return batter

# Resolve 'S' batter hands with fix_switch, then save the table.
# NOTE(review): `matchups` is whatever frame an earlier cell left bound to that
# name -- confirm it is the combined matchup table before running this cell.
matchups['batter_hand'] = matchups.apply(fix_switch, axis=1)
# os.path.join instead of a backslash literal so the path also works off Windows.
matchups.to_csv(os.path.join("data", "matchups", "all_matchups.csv"))


In [None]:
# Function to add h2h stats to matchups
def add_h2h_stats(row, h2h_source=None, woba_weights=None):
    """Attach batter-vs-pitcher (head-to-head) summary columns to one matchup row.

    Every record in `h2h_source` that shares `row`'s `batter_id` and
    `pitcher_id` is tallied by its raw `outcome` code; the tallies are turned
    into a wOBA figure and per-plate-appearance rates.

    Parameters
    ----------
    row : pandas.Series or dict-like
        Must provide `batter_id` and `pitcher_id`. Modified in place.
    h2h_source : pandas.DataFrame, optional
        Table with `batter_id`, `pitcher_id` and `outcome` columns. Defaults
        to the notebook-level `matchups` frame.
    woba_weights : mapping, optional
        Provides `wBB`, `wHBP`, `w1B`, `w2B`, `w3B` and `wHR`. Defaults to
        `wOBA_constants.loc['Average']`.

    Returns
    -------
    The same `row` with `h2h_wOBA`, `h2h_pa`, `h2h_1B_pa`, `h2h_2B_pa`,
    `h2h_3B_pa`, `h2h_HR_pa`, `h2h_BB_pa` and `h2h_K_pa` set. All of them are
    0 when the pair has no records in `h2h_source`.
    """
    if h2h_source is None:
        h2h_source = matchups
    if woba_weights is None:
        woba_weights = wOBA_constants.loc['Average']

    same_pair = (h2h_source['batter_id'] == row['batter_id']) & (h2h_source['pitcher_id'] == row['pitcher_id'])
    h2h_stats = h2h_source.loc[same_pair]

    num_pa = len(h2h_stats)
    outcomes = h2h_stats.groupby('outcome').size()

    def tally(codes):
        # Total occurrences of the given outcome codes; absent codes count as 0.
        return sum(int(outcomes[code]) for code in codes if code in outcomes.index)

    digits = '123456789'
    singles = tally(f'h_S{digit}' for digit in digits)
    doubles = tally(f'h_D{digit}' for digit in digits)
    triples = tally(f'h_T{digit}' for digit in digits)
    home_runs = tally(['h_HR'])
    bb = tally(['p_W'])
    k = tally(['p_K'])
    # Intentional walks and hit-by-pitches carry the same 'p_' prefix as the
    # other pitcher outcomes ('p_IW', 'p_HP' -- the codes fix_outcomes maps).
    # Looking them up as 'IW' / 'HP' never matched, leaving both counts at 0.
    ibb = tally(['p_IW'])
    hbp = tally(['p_HP'])

    at_bats = outcomes.sum() - bb - hbp

    wOBA_num = (
        (woba_weights['wBB'] * bb)
        + (woba_weights['wHBP'] * hbp)
        + (woba_weights['w1B'] * singles)
        + (woba_weights['w2B'] * doubles)
        + (woba_weights['w3B'] * triples)
        + (woba_weights['wHR'] * home_runs)
    )
    wOBA_denom = at_bats + (bb - ibb) + hbp

    if wOBA_denom:
        wOBA = round(wOBA_num / wOBA_denom, 3)
    else:
        wOBA = 0

    def per_pa(events):
        # Guard the division: a pair with no matching records gets a rate of 0.
        return round(events / num_pa, 3) if num_pa else 0.0

    row['h2h_wOBA']  = wOBA
    row['h2h_pa']    = num_pa
    row['h2h_1B_pa'] = per_pa(singles)
    row['h2h_2B_pa'] = per_pa(doubles)
    row['h2h_3B_pa'] = per_pa(triples)
    row['h2h_HR_pa'] = per_pa(home_runs)
    row['h2h_BB_pa'] = per_pa(bb)
    row['h2h_K_pa']  = per_pa(k)

    return row


# Add the head-to-head columns to every matchup row and save a test copy.
# add_h2h_stats re-filters the whole table for each row, so this cell's cost
# grows quadratically with the number of rows.
matchups = matchups.apply(add_h2h_stats, axis=1)
# os.path.join instead of a backslash literal so the path also works off Windows.
matchups.to_csv(os.path.join("data", "matchups", "all_matchups_test.csv"))


In [None]:
# Combine batter gamelogs
# Stat columns kept from each batter file. Each is renamed with a "b_" prefix,
# so the output names are derived from this single list.
batter_stat_columns = ['imputed','r_wOBA','r_PA','r_1B_PA','r_2B_PA','r_3B_PA','r_HR_PA',
                       'r_BB_PA','r_IBB_PA','r_HBP_PA','r_GDP_PA','r_K_PA','r_SF_PA','r_SH_PA','r_avg_pf',
                       'h_wOBA','h_PA','h_1B_PA','h_2B_PA','h_3B_PA','h_HR_PA','h_BB_PA',
                       'h_IBB_PA','h_HBP_PA','h_GDP_PA','h_K_PA','h_SF_PA','h_SH_PA','h_avg_pf']

count = 0
all_gamelogs = []
for batter in batter_list:

    # os.path.join instead of a backslash literal so the path also works off Windows.
    batter_gamelog = pd.read_csv(os.path.join("data", "gamelogs", "batters2", f"{batter}.csv"), index_col=0)
    # Most frequent value of the 'pos' column in this batter's log.
    batter_position = batter_gamelog['pos'].mode().values[0]

    # .copy() yields an independent frame, so the column edits below never
    # operate on a slice of the frame that was read from disk.
    batter_gamelog = batter_gamelog[['game_code'] + batter_stat_columns].copy()
    batter_gamelog.columns = ['game_code'] + [f"b_{column}" for column in batter_stat_columns]
    batter_gamelog['b_pos'] = batter_position

    # Key of the form "<batter>-<game_code>", stored as the first column in
    # place of the raw game_code.
    batter_game_code = batter_gamelog.apply(lambda row: "{}-{}".format(batter, row['game_code']), axis=1)
    batter_gamelog = batter_gamelog.drop(columns=['game_code'])
    batter_gamelog.insert(0, 'batter_game_code', batter_game_code)

    all_gamelogs.append(batter_gamelog)
    # Increment before reporting so the last line printed is 100%.
    count += 1
    print(f"Batters: {round(count/len(batter_list)*100, 2)}%")

# NOTE(review): the merge cell reads a frame named `batter_gamelogs`, which this
# cell does not define -- confirm where that frame is loaded from.
all_gamelogs = pd.concat(all_gamelogs, ignore_index=True)
all_gamelogs.to_csv(os.path.join("data", "gamelogs", "all_gamelogs.csv"))


In [None]:
# Combine pitcher gamelogs
# Stat columns kept from each pitcher file. Each is renamed with a "p_" prefix,
# so the output names are derived from this single list.
pitcher_stat_columns = ['imputed','r_wOBA','r_PA','r_1B_PA','r_2B_PA','r_3B_PA','r_HR_PA',
                        'r_BB_PA','r_IBB_PA','r_HBP_PA','r_GDP_PA','r_K_PA','r_SF_PA','r_avg_pf',
                        'h_wOBA','h_PA','h_1B_PA','h_2B_PA','h_3B_PA','h_HR_PA','h_BB_PA',
                        'h_IBB_PA','h_HBP_PA','h_GDP_PA','h_K_PA','h_SF_PA','h_avg_pf']

count = 0
all_gamelogs = []
for pitcher in pitcher_list:

    # os.path.join instead of a backslash literal so the path also works off Windows.
    pitcher_gamelog = pd.read_csv(os.path.join("data", "gamelogs", "pitchers2", f"{pitcher}.csv"), index_col=0)

    # .copy() yields an independent frame, so the column edits below never
    # operate on a slice of the frame that was read from disk.
    pitcher_gamelog = pitcher_gamelog[['game_code'] + pitcher_stat_columns].copy()
    pitcher_gamelog.columns = ['game_code'] + [f"p_{column}" for column in pitcher_stat_columns]

    # Key of the form "<pitcher>-<game_code>", stored as the first column in
    # place of the raw game_code.
    pitcher_game_code = pitcher_gamelog.apply(lambda row: "{}-{}".format(pitcher, row['game_code']), axis=1)
    pitcher_gamelog = pitcher_gamelog.drop(columns=['game_code'])
    pitcher_gamelog.insert(0, 'pitcher_game_code', pitcher_game_code)

    all_gamelogs.append(pitcher_gamelog)
    # Increment before reporting so the last line printed is 100%.
    count += 1
    print(f"Pitchers: {round(count/len(pitcher_list)*100, 2)}%")

all_gamelogs = pd.concat(all_gamelogs, ignore_index=True)
all_gamelogs.to_csv(os.path.join("data", "gamelogs", "pitcher_gamelogs.csv"))


In [None]:
# Merge matchups with gamelogs
# NOTE(review): `all_matchups`, `batter_gamelogs` and `pitcher_gamelogs` are not
# defined in the cells shown here -- confirm they are loaded before this runs.
final_matchups = (
    all_matchups
    .merge(batter_gamelogs, on='batter_game_code', how='left')
    .merge(pitcher_gamelogs, on='pitcher_game_code', how='left')
    .drop_duplicates()
)
# Drop rows whose batter position is 'P'; rows with a missing b_pos (no gamelog
# match in the left join) compare unequal to 'P' and are therefore kept.
final_matchups = final_matchups.loc[final_matchups['b_pos'] != 'P']
# os.path.join instead of a backslash literal so the path also works off Windows.
final_matchups.to_csv(os.path.join("data", "matchups", "final_matchups.csv"))

In [None]:
# Convert batter/pitcher hands to boolean
def convert_hands(row):
    """Encode `batter_hand` and `pitcher_hand` as 0/1 and return the row.

    'R' becomes 0 and any other hand label becomes 1. Values that already
    equal 0 or 1 are left as they are, so running the conversion a second
    time on encoded data no longer turns every 0 into 1.

    The row is modified in place and also returned.
    """
    for column in ('batter_hand', 'pitcher_hand'):
        hand = row[column]
        if hand == 'R':
            row[column] = 0
        elif hand not in (0, 1):
            # Not 'R' and not yet encoded: everything else maps to 1.
            row[column] = 1

    return row

# Run convert_hands over every matchup row (row-wise apply).
matchups = matchups.apply(convert_hands, axis='columns')

In [None]:
# Reduce the number of outcomes
# Ordered (label, raw codes) pairs; the first group containing a code wins.
_OUTCOME_GROUPS = (
    ('BAD', ('h_SX', 'h_D', 'h_G', 'h_+', 'h_.', 'h_-', 'p_BK')),
    ('1B',  ('h_S1', 'h_S2', 'h_S3', 'h_S4', 'h_S5', 'h_S6', 'h_S7', 'h_S8', 'h_S9')),
    ('2B',  ('h_D1', 'h_D2', 'h_D3', 'h_D4', 'h_D5', 'h_D6', 'h_D7', 'h_D8', 'h_D9')),
    ('3B',  ('h_T1', 'h_T2', 'h_T3', 'h_T4', 'h_T5', 'h_T6', 'h_T7', 'h_T8', 'h_T9')),
    ('HR',  ('h_HR',)),
    ('E',   ('h_E1', 'h_E2', 'h_E3', 'h_E4', 'h_E5', 'h_E6', 'h_E7', 'h_E8', 'h_E9')),
    ('DG',  ('h_DG',)),
    ('GO',  ('h_1', 'h_2', 'h_3', 'h_4', 'h_5', 'h_6', 'h_7', 'h_8', 'h_9')),
    ('SO',  ('p_K',)),
    ('BB',  ('p_W',)),
    ('IBB', ('p_IW',)),
    ('HBP', ('p_HP',)),
)


def fix_outcomes(row):
    """Map the raw code in `row['outcome']` to its reduced label.

    Returns the label of the first group in `_OUTCOME_GROUPS` that lists the
    code; a code found in no group is returned unchanged. The row itself is
    not modified.
    """
    raw_code = row['outcome']
    for label, codes in _OUTCOME_GROUPS:
        if raw_code in codes:
            return label
    return raw_code

# Collapse the raw outcome codes, then keep only rows not labelled 'BAD'.
matchups['outcome'] = matchups.apply(fix_outcomes, axis=1)
matchups = matchups.loc[matchups['outcome'].ne('BAD')]