In [13]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from joblib import load
pd.options.mode.chained_assignment = None  # default='warn'

In [14]:
bat = load_model('../core/models/model_batting.h5')
pitch = load_model('../core/models/model_pitching.h5')
bat_scaler = load('../core/models/batting_scaler.save')
pitch_scaler = load('../core/models/pitching_scaler.save')

In [15]:
gl = pd.read_csv('../core/data/retrosheet/gamelogs/GL2015.csv')

In [16]:
gl

Unnamed: 0,visit_team,home_team,visit_score,home_score,game_length_outs,night_game,park_id,visit_manager_id,home_manager_id,visit_sp_id,...,home_player_4_id,home_player_5_id,home_player_6_id,home_player_7_id,home_player_8_id,home_player_9_id,year,month,day,home_win
0,MIN,DET,0,4,51,0,DET05,molip001,ausmb001,hughp001,...,martv001,martj006,cespy001,castn001,avila001,iglej001,2015,4,6,1
1,CLE,HOU,0,2,51,1,HOU03,frant001,hinca001,klubc001,...,gatte001,cartc002,castj006,lowrj001,rasmc001,marij002,2015,4,6,1
2,CHW,KCR,1,10,51,0,KAN06,ventr001,yoste001,samaj001,...,hosme001,morak001,gorda001,riosa002,peres002,infao001,2015,4,6,1
3,TOR,NYY,6,1,54,0,NYC21,gibbj001,giraj001,hutcd001,...,teixm001,mccab002,headc001,rodra001,drews001,gregd001,2015,4,6,0
4,TEX,OAK,0,8,51,1,OAK01,banij001,melvb001,gally001,...,butlb003,davii001,lawrb002,vogts001,semim001,sogae001,2015,4,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2423,CHC,MIL,3,1,54,0,MIL06,maddj801,counc001,hared001,...,davik003,santd002,pereh001,seguj002,maldm001,lopej004,2015,10,4,0
2424,WSN,NYM,0,1,51,0,NYC20,willm003,collt801,roart001,...,cespy001,dudal001,darnt001,confm001,tejar001,degrj001,2015,10,4,1
2425,FLA,PHI,2,7,51,0,PHI13,jennd801,mackp101,conla001,...,ruf-d001,franj004,blana001,krate001,ruppc001,buchd001,2015,10,4,1
2426,CIN,PIT,0,4,51,0,PIT08,pricb801,hurdc001,smitj004,...,walkn001,marts002,alvap001,cervf001,mercj002,happj001,2015,10,4,1


In [17]:
columns = {
    'batting': [],
    'pitching': []
}

In [18]:
batters = pd.read_csv('../core/output/batters.csv')
batter_years = pd.read_csv('../core/output/batting.csv')
batters_not_counted = list(batter_years[~batter_years['retroID']
                                        .isin(batters['retroID'])]['retroID'].values)
pitchers = pd.read_csv('../core/output/pitchers.csv')
pitcher_years = pd.read_csv('../core/output/pitching.csv')
bat_scaler = load('../core/models/batting_scaler.save')
pitch_scaler = load('../core/models/pitching_scaler.save')
scalers = {
    'batting': bat_scaler,
    'pitching': pitch_scaler
}
career_features = {
    'batting': [
        'G', 'AB', 'PA', 'R', 'H', '1B', '2B', '3B',
        'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB',
        'HBP', 'SH', 'SF', 'GIDP'
    ],
    'pitching': [
        'CG', 'SHO', 'H', 'ER', 'HR', 'BB', 'SO',
        'BAOpp', 'ERA', 'IBB', 'WP', 'HBP', 'BK',
        'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP'
    ]
}
unwanted_features = {
    'batting': ['retroID', 'G', 'AB', '1B', 'RBI', 'wOBA', 'Batting'],
    'pitching': ['IPouts', 'BFP', 'R', 'Pitching']
}
players = {
    'batting': {
        'players': batters,
        'years': batter_years
    },
    'pitching': {
        'players': pitchers,
        'years': pitcher_years
    }
}

In [19]:
def to_tensor_input(scaler, player, label):
    scalers[label] = scaler
    return scaler.transform(player.values.reshape(-1, player.shape[0]))[0]


def convert_single_player(retro_id, year, player_type_label):
    scaler = scalers[player_type_label]
    if retro_id in batters_not_counted:
        return np.zeros(shape=(1, 30))
    player_table = players[player_type_label]['players']
    player_so_far_table = players[player_type_label]['years']
    player = player_table[player_table['retroID'] == retro_id]
    player_so_far = player_so_far_table[(player_so_far_table['retroID'] == retro_id)
                                        & (player_so_far_table['yearID'] <= year)]
    if not player.size | player_so_far.size:
        print('Handled: {}'.format(retro_id))
        return np.zeros(shape=(1, 30))
    player_so_far = player_so_far.groupby('retroID').sum()
    features = career_features[player_type_label]
    try:
        for column in player[features]:
            player.iloc[0][column] = player_so_far.iloc[0][column]
    except:
        print(retro_id)
    player_columns_to_drop = unwanted_features[player_type_label]
    player = player.drop(columns=player_columns_to_drop)
    if not len(list(columns[player_type_label])):
        columns[player_type_label] = player.columns
    return to_tensor_input(scaler, player.T, player_type_label)


def get_batter_as_tensor_input(batter, year):
    scaler = scalers['batting']
    player = batters[batters['retroID'] == batter]
    player_so_far = batter_years[(batter_years['retroID'] == batter)
                                 & (batter_years['yearID'] <= year)]
    player_so_far = player_so_far.groupby('retroID').sum()
    features = ['G', 'AB', 'PA', 'R', 'H', '1B', '2B', '3B',
                'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB',
                'HBP', 'SH', 'SF', 'GIDP']
    for column in player[features]:
        player.iloc[0][column] = player_so_far.iloc[0][column]
    player_columns_to_drop = ['retroID', 'wOBA', 'Batting']
    player = player.drop(columns=player_columns_to_drop)
    return to_tensor_input(scaler, player, 'batting')


In [20]:
convert_single_player('bettm001', 2015, 'batting')

array([0.42623   , 0.3       , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.16129032, 0.2288002 , 0.2671024 , 0.22673872,
       0.30697051, 0.13612565, 0.1824147 , 0.08961593, 0.07462687,
       0.14503518, 0.17866769, 0.03633721, 0.06666667, 0.01486989,
       0.25      , 0.10379747, 0.        , 0.21133094, 0.26473988])

In [21]:
gl.iloc[43]

visit_team                SFG
home_team                 SDP
visit_score                 1
home_score                  0
game_length_outs           72
night_game                  0
park_id                 SAN02
visit_manager_id     bochb002
home_manager_id      blacb001
visit_sp_id          hudst001
home_sp_id           kenni001
visit_player_1_id    aokin001
visit_player_2_id    panij002
visit_player_3_id    pagaa001
visit_player_4_id    poseb001
visit_player_5_id    crawb001
visit_player_6_id    mcgec001
visit_player_7_id    blang001
visit_player_8_id    ariaj001
visit_player_9_id    hudst001
home_player_1_id     myerw001
home_player_2_id     norrd001
home_player_3_id     kempm001
home_player_4_id     uptoj001
home_player_5_id     middw001
home_player_6_id     alony001
home_player_7_id     gyorj001
home_player_8_id     amara001
home_player_9_id     kenni001
year                     2015
month                       4
day                         9
home_win                    0
Name: 43, 

In [22]:
v1 = gl.iloc[0]['visit_player_1_id']

In [23]:
v1

'santd001'

In [24]:
visit_id = []
home_id = []

In [25]:
for i in range(1, 10):
    visit_id.append(gl.iloc[43]['visit_player_{}_id'.format(i)])
    home_id.append(gl.iloc[43]['home_player_{}_id'.format(i)])

In [26]:
visit_id

['aokin001',
 'panij002',
 'pagaa001',
 'poseb001',
 'crawb001',
 'mcgec001',
 'blang001',
 'ariaj001',
 'hudst001']

In [27]:
gl.iloc[0]['year']

2015

In [28]:
visit = []
home = []
year = gl.iloc[43]['year']
for index in range(0, 9):
    vrid = visit_id[index]
#     vpos = 'pitching' if vrid == gl.iloc[0]['visit_sp_id'] else 'batting'
    vplayer = convert_single_player(vrid, year, 'batting')
    visit.append(vplayer)
    hrid = home_id[index]
#     hpos = 'pitching' if hrid == gl.iloc[0]['home_sp_id'] else 'batting'
    hplayer = convert_single_player(hrid, year, 'batting')
    home.append(hplayer)

In [29]:
visit[0].shape

(30,)

In [30]:
home[0].shape

(30,)

In [31]:
gl.columns

Index(['visit_team', 'home_team', 'visit_score', 'home_score',
       'game_length_outs', 'night_game', 'park_id', 'visit_manager_id',
       'home_manager_id', 'visit_sp_id', 'home_sp_id', 'visit_player_1_id',
       'visit_player_2_id', 'visit_player_3_id', 'visit_player_4_id',
       'visit_player_5_id', 'visit_player_6_id', 'visit_player_7_id',
       'visit_player_8_id', 'visit_player_9_id', 'home_player_1_id',
       'home_player_2_id', 'home_player_3_id', 'home_player_4_id',
       'home_player_5_id', 'home_player_6_id', 'home_player_7_id',
       'home_player_8_id', 'home_player_9_id', 'year', 'month', 'day',
       'home_win'],
      dtype='object')

In [32]:
batters = visit + home

In [33]:
dfb = pd.DataFrame(batters, columns=columns['batting'])

In [34]:
dfb

Unnamed: 0,weight,height,pos_1B,pos_2B,pos_3B,pos_C,pos_OF,pos_P,pos_SS,bats_L,...,BB,SO,IBB,HBP,SH,SF,GIDP,NL,wRC+,WAR
0,0.42623,0.3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.091478,0.099345,0.00436,0.168421,0.111524,0.117188,0.124051,1.0,0.184353,0.105202
1,0.508197,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.085614,0.09742,0.020349,0.073684,0.048327,0.203125,0.129114,1.0,0.177158,0.103468
2,0.508197,0.55,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.124707,0.245668,0.026163,0.014035,0.078067,0.265625,0.136709,1.0,0.181655,0.157803
3,0.54918,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.18921,0.244128,0.090116,0.147368,0.003717,0.398438,0.374684,1.0,0.205036,0.354335
4,0.618852,0.55,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.15559,0.360801,0.09157,0.129825,0.022305,0.367188,0.235443,1.0,0.173561,0.172254
5,0.590164,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.097342,0.199076,0.020349,0.02807,0.0,0.242188,0.291139,1.0,0.171763,0.072832
6,0.454918,0.35,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.141908,0.254524,0.030523,0.052632,0.096654,0.117188,0.088608,1.0,0.173561,0.101734
7,0.446721,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014464,0.057759,0.010174,0.02807,0.04461,0.070312,0.060759,1.0,0.158273,0.051445
8,0.405738,0.5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.010164,0.073161,0.0,0.007018,0.249071,0.015625,0.03038,1.0,0.095324,0.058382
9,0.528689,0.6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.122361,0.32268,0.020349,0.045614,0.003717,0.15625,0.179747,1.0,0.186151,0.104624


In [35]:
btensor = [dfb, gl.iloc[43]['home_win']]

In [36]:
# btensor

In [37]:
gl.shape

(2428, 33)

In [38]:
gl.shape[0]

2428

In [39]:
players['batting']['players']['retroID'].str.contains('aardd001').sum() == 1

True

<h4>Modular script to handle all gamelogs</h4>

In [40]:
cols = list(columns['batting'].values) + ['Result']
for year in range(1919, 2020):
    df = pd.DataFrame()
    print('{}'.format(year))
#     gl = pd.read_csv('../core/data/retrosheet/gamelogs/GL{}.csv'.format(year))
#     for index in range(0, gl.shape[0]):
#         visit_id = []
#         home_id = []
#         for i in range(1, 10):
#             visit_id.append(gl.iloc[index]['visit_player_{}_id'.format(i)])
#             home_id.append(gl.iloc[index]['home_player_{}_id'.format(i)])
#         visit = []
#         home = []
#         for i in range(0, 9):
#             vrid = visit_id[i]
#             vplayer = convert_single_player(vrid, year, 'batting')
#             visit.append(vplayer)
#             hrid = home_id[i]
#             hplayer = convert_single_player(hrid, year, 'batting')
#             home.append(hplayer)
#         batters = list(np.append(np.array(visit + home).flatten(), gl.iloc[index]['home_win']))
#         try:
#             bat_df = pd.DataFrame(batters)
#         except:
#             print('{0}\n{1}'.format(vrid))
#         df = df.append(bat_df.T)

#     if not os.path.exists('../core/tensors/games/'):
#         os.mkdir('../core/tensors/games/')
#     df.to_csv('../core/tensors/games/{0}.csv'.format(str(year)), index=False, header=None)

1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
