In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Load the two Lahman tables used below: Fielding -> df1, People -> df2.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../core/data/lahman/mlb_data/Fielding.csv',
        '../core/data/lahman/mlb_data/People.csv',
    )
)

In [3]:
# List the columns available in the Fielding table before choosing which to keep.
df1.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS',
       'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB', 'CS', 'ZR'],
      dtype='object')

In [4]:
# Only the player key and the fielding position are used from this table.
fielding_columns = ['playerID', 'POS']
df1 = df1.loc[:, fielding_columns]

In [5]:
# List the columns available in the People table before choosing which to keep.
df2.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID'],
      dtype='object')

In [6]:
# Keep the player key plus the biographical / career-date fields needed later.
people_columns = [
    'playerID',
    'birthYear',
    'bats',
    'throws',
    'weight',
    'height',
    'debut',
    'finalGame',
]
df2 = df2.loc[:, people_columns]

In [7]:
# Preview the trimmed People table; note that debut / finalGame carry two-digit years.
df2.head()

Unnamed: 0,playerID,birthYear,bats,throws,weight,height,debut,finalGame
0,omaraol01,1891.0,R,R,155.0,69.0,9/8/12,4/19/19
1,steelbo01,1894.0,B,L,175.0,70.0,4/17/16,4/24/19
2,fishebo01,1886.0,R,R,170.0,69.0,6/3/12,4/25/19
3,kanefr01,1895.0,L,R,175.0,71.0,9/13/15,4/28/19
4,kingle01,1894.0,R,R,150.0,70.0,6/24/16,4/28/19


In [8]:
# Pull one raw debut value to experiment with date parsing.
string = df2.loc[df2['playerID'] == 'omaraol01', 'debut'].iloc[0]

In [9]:
# '%y' reads a two-digit year, so the sample debut '9/8/12' comes back as 2012;
# the century therefore has to be corrected separately (see get_debut_year).
datetime.strptime(string, '%m/%d/%y').year

2012

In [10]:
# Replace missing debut / final-game dates with a '0' sentinel that
# get_debut_year / get_final_year recognise as "no date recorded".
#
# The fill goes through the DataFrame instead of
# `df2[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary Series and leaves df2 untouched when pandas Copy-on-Write is
# active. The sentinel is the string '0' so the date columns stay all-string;
# the parsers compare `str(value)` against '0', so they treat it exactly like
# the previous integer 0.
df2 = df2.fillna({'debut': '0', 'finalGame': '0'})

In [11]:
def get_debut_year(player):
    """Return the four-digit year of a player's debut, or 0 if it is unknown.

    Parameters
    ----------
    player : mapping or pandas.Series
        Must provide ``'birthYear'`` (a number, possibly NaN) and ``'debut'``.
        ``'debut'`` is an ``m/d/yy`` string, a ``YYYY-MM-DD`` string, or a
        missing value (the ``0`` / ``'0'`` fill sentinel, NaN or None).

    Returns
    -------
    int
        The debut year, or ``0`` when no debut date is recorded.

    Raises
    ------
    ValueError
        If a debut value is present but matches neither date format.

    Notes
    -----
    When the date has a two-digit year and ``birthYear`` is NaN, the century
    cannot be checked and the year is returned as parsed.
    """
    debut_as_string = str(player['debut']).strip()
    # str() of the fill sentinel, or of a value that was never filled
    # (NaN / None / NaT / pd.NA): all mean "no debut recorded".
    if debut_as_string in ('0', '0.0', '', 'nan', 'None', 'NaT', '<NA>'):
        return 0

    try:
        debut_year = datetime.strptime(debut_as_string, '%m/%d/%y').year
    except ValueError:
        # ISO dates carry the full year, so there is no century to guess and
        # the age-based correction below must not be applied to them.
        return datetime.strptime(debut_as_string, '%Y-%m-%d').year

    # '%y' places two-digit years in 1969-2068. A debut more than 50 years
    # after birth means the real date lies one century earlier.
    if debut_year - player['birthYear'] > 50:
        debut_year -= 100
    return debut_year

In [12]:
def get_final_year(player):
    """Return the four-digit year of a player's final game, or 0 if unknown.

    Parameters
    ----------
    player : mapping or pandas.Series
        Must provide ``'birthYear'``, ``'debut'`` and ``'finalGame'``.
        ``'finalGame'`` is an ``m/d/yy`` string, a ``YYYY-MM-DD`` string, or a
        missing value (the ``0`` / ``'0'`` fill sentinel, NaN or None).

    Returns
    -------
    int
        The final-game year, or ``0`` when no final-game date is recorded.

    Raises
    ------
    ValueError
        If the debut or final-game value is present but matches neither
        date format.
    """
    # Resolved first so a malformed debut value is still reported.
    debut_year = get_debut_year(player)

    final_as_string = str(player['finalGame']).strip()
    # str() of the fill sentinel, or of a value that was never filled.
    if final_as_string in ('0', '0.0', '', 'nan', 'None', 'NaT', '<NA>'):
        return 0

    try:
        final_year = datetime.strptime(final_as_string, '%m/%d/%y').year
    except ValueError:
        # ISO dates carry the full year; no century correction is needed.
        return datetime.strptime(final_as_string, '%Y-%m-%d').year

    if debut_year:
        # A career longer than 50 years means '%y' picked the wrong century.
        wrong_century = final_year - debut_year > 50
    else:
        # Debut unknown (0): comparing against 0 would shift every year back,
        # so fall back to the birth year. A final game more than 100 years
        # after birth can only be a century error. A NaN birth year makes the
        # comparison False and leaves the parsed year as is.
        wrong_century = final_year - player['birthYear'] > 100

    if wrong_century:
        final_year -= 100
    return final_year

In [13]:
# Derive four-digit career start / end years, one row at a time.
df2 = df2.assign(
    debutYear=lambda frame: frame.apply(get_debut_year, axis=1),
    finalYear=lambda frame: frame.apply(get_final_year, axis=1),
)

In [14]:
# Spot check: players born in 1995 should keep their 20xx debut / final years.
df2[df2['birthYear'] == 1995]

Unnamed: 0,playerID,birthYear,bats,throws,weight,height,debut,finalGame,debutYear,finalYear
13594,herrero01,1995.0,R,R,185.0,71.0,6/14/17,6/29/17,2017,2017
13765,cordoal01,1995.0,R,R,175.0,73.0,4/3/17,10/1/17,2017,2017
13777,haysau01,1995.0,R,R,195.0,73.0,9/7/17,10/1/17,2017,2017
13788,martefr01,1995.0,R,R,225.0,73.0,6/9/17,10/1/17,2017,2017
13908,bautige01,1995.0,R,R,195.0,75.0,4/17/18,6/2/18,2018,2018
13931,arroych01,1995.0,R,R,180.0,73.0,4/24/17,6/15/18,2017,2018
14021,williju02,1995.0,L,R,215.0,74.0,7/21/18,7/21/18,2018,2018
14078,pareded01,1995.0,R,R,230.0,73.0,6/23/17,8/12/18,2017,2018
14079,gonzame01,1995.0,R,R,216.0,72.0,4/19/18,8/13/18,2018,2018
14129,nottija01,1995.0,R,R,230.0,74.0,4/16/18,9/4/18,2018,2018


In [15]:
# The raw date strings are no longer needed once the year columns exist.
df2 = df2.drop(columns=['debut', 'finalGame'])

In [16]:
# This will be exported to a separate module - retroid_dict.py
# Lookup table mapping each People.csv playerID to its retroID.
ids = pd.read_csv('../core/data/lahman/mlb_data/People.csv').loc[:, ['playerID', 'retroID']]
id_dict = ids.set_index('playerID')['retroID'].to_dict()

def get_retroid(id):
    """Translate a ``playerID`` into the ``retroID`` stored in ``id_dict``.

    Parameters
    ----------
    id : str
        A ``playerID`` key.

    Returns
    -------
    The value mapped to ``id`` in ``id_dict``, or ``''`` when the lookup
    table is unavailable or has no entry for ``id``.

    Notes
    -----
    The stored value is returned unchanged, so a player whose retroID is
    missing in the source table presumably comes back as NaN; the notebook
    filters null retroIDs out afterwards.
    """
    if id_dict is None:
        return ''
    # .get() makes the '' fallback cover unknown ids too, instead of a KeyError.
    return id_dict.get(id, '')

In [17]:
# Replace the playerID key with the retroID key in both tables.
df1 = (
    df1.assign(playerID=df1['playerID'].apply(get_retroid))
       .rename(columns={'playerID': 'retroID'})
)
df2 = (
    df2.assign(playerID=df2['playerID'].apply(get_retroid))
       .rename(columns={'playerID': 'retroID'})
)

In [18]:
# Collapse to one row per player, keeping the position that occurs most often.
# mode() returns its values sorted, so ties resolve to the first in sort order.
df1 = df1.groupby('retroID').agg(lambda pos: pos.mode().iloc[0])

In [19]:
# Turn the retroID group index back into an ordinary column, then display the table.
df1 = df1.reset_index()
df1

Unnamed: 0,retroID,POS
0,aardd001,P
1,aaroh101,OF
2,aarot101,1B
3,aased001,P
4,abada001,1B
...,...,...
15026,zupcb001,OF
15027,zupof101,C
15028,zuveg101,P
15029,zuvep001,SS


In [20]:
# Players without a retroID cannot be joined to the fielding table; drop them.
df2 = df2.dropna(subset=['retroID'])

In [21]:
# Inspect the People table after removing rows without a retroID.
df2

Unnamed: 0,retroID,birthYear,bats,throws,weight,height,debutYear,finalYear
0,omaro101,1891.0,R,R,155.0,69.0,1912,1919
1,steeb103,1894.0,B,L,175.0,70.0,1916,1919
2,fisht103,1886.0,R,R,170.0,69.0,1912,1919
3,kanef101,1895.0,L,R,175.0,71.0,1915,1919
4,kingl101,1894.0,R,R,150.0,70.0,1916,1919
...,...,...,...,...,...,...,...,...
17293,willj801,1847.0,,,,,0,0
17296,winkb801,1930.0,R,R,168.0,69.0,0,0
17297,wriga801,1842.0,,,,,0,0
17299,younn801,1840.0,,,,,0,0


In [22]:
# Size of the aggregated fielding table (one row per retroID).
df1.shape

(15031, 2)

In [23]:
# Sanity check: count fielding rows whose retroID also exists in df2. This
# should equal df1.shape, meaning the merge below loses no players.
df1[df1['retroID'].isin(df2['retroID'])].shape

(15031, 2)

In [24]:
# Preview the fielding table ahead of the merge.
df1.head()

Unnamed: 0,retroID,POS
0,aardd001,P
1,aaroh101,OF
2,aarot101,1B
3,aased001,P
4,abada001,1B


In [25]:
# Preview the People table ahead of the merge.
df2.head()

Unnamed: 0,retroID,birthYear,bats,throws,weight,height,debutYear,finalYear
0,omaro101,1891.0,R,R,155.0,69.0,1912,1919
1,steeb103,1894.0,B,L,175.0,70.0,1916,1919
2,fisht103,1886.0,R,R,170.0,69.0,1912,1919
3,kanef101,1895.0,L,R,175.0,71.0,1915,1919
4,kingl101,1894.0,R,R,150.0,70.0,1916,1919


In [26]:
# Look up a single player in the People table by retroID.
df2[df2['retroID'] == 'aardd001']

Unnamed: 0,retroID,birthYear,bats,throws,weight,height,debutYear,finalYear
13110,aardd001,1981.0,R,R,215.0,75.0,2004,2015


In [27]:
# Join position and biographical data on retroID (inner join, the pandas default).
df = df1.merge(df2, on='retroID')

In [28]:
# Display the merged per-player table.
df

Unnamed: 0,retroID,POS,birthYear,bats,throws,weight,height,debutYear,finalYear
0,aardd001,P,1981.0,R,R,215.0,75.0,2004,2015
1,aaroh101,OF,1934.0,R,R,180.0,72.0,1954,1976
2,aarot101,1B,1939.0,R,R,190.0,75.0,1962,1971
3,aased001,P,1954.0,R,R,190.0,75.0,1977,1990
4,abada001,1B,1972.0,L,L,184.0,73.0,2001,2006
...,...,...,...,...,...,...,...,...,...
15026,zupcb001,OF,1966.0,R,R,220.0,76.0,1991,1994
15027,zupof101,C,1939.0,L,R,182.0,71.0,1957,1961
15028,zuveg101,P,1924.0,R,R,195.0,76.0,1951,1959
15029,zuvep001,SS,1958.0,R,R,173.0,72.0,1982,1991
