In [1]:
import pandas as pd
from datetime import datetime

In [2]:
# Read the raw Lahman Fielding (df1) and People (df2) tables from disk.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../core/data/lahman/mlb_data/Fielding.csv',
        '../core/data/lahman/mlb_data/People.csv',
    )
)

In [3]:
# Inspect the Fielding columns before choosing which ones to keep.
df1.columns

Index(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS',
       'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB', 'CS', 'ZR'],
      dtype='object')

In [4]:
# Keep only the player identifier and the fielding position.
# .copy() makes df1 an independent frame, so later cells can add or overwrite
# its columns without pandas treating it as a slice of the raw table
# (SettingWithCopyWarning).
fielding_columns = ['playerID', 'POS']
df1 = df1[fielding_columns].copy()

In [5]:
# Inspect the People columns before choosing which ones to keep.
df2.columns

Index(['playerID', 'birthYear', 'birthMonth', 'birthDay', 'birthCountry',
       'birthState', 'birthCity', 'deathYear', 'deathMonth', 'deathDay',
       'deathCountry', 'deathState', 'deathCity', 'nameFirst', 'nameLast',
       'nameGiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalGame',
       'retroID', 'bbrefID'],
      dtype='object')

In [6]:
# Keep the identifying, physical and career-date columns of each player.
# .copy() makes df2 an independent frame, so later cells can add or overwrite
# its columns without pandas treating it as a slice of the raw table
# (SettingWithCopyWarning).
people_columns = [
    'playerID', 'nameFirst', 'nameLast', 'birthYear', 'bats', 'throws',
    'weight', 'height', 'debut', 'finalGame',
]
df2 = df2[people_columns].copy()

In [7]:
# df2.head()

In [8]:
# Grab the raw debut value of one player to probe how dates are formatted.
string = df2.loc[df2['playerID'] == 'omaraol01', 'debut'].iloc[0]

In [9]:
# Probe: the '%m/%d/%y' pattern does not fit this value (see the ValueError
# below), which is why the year helpers also try '%Y-%m-%d'.
datetime.strptime(string, '%m/%d/%y').year

ValueError: time data '1912-09-08' does not match format '%m/%d/%y'

In [10]:
# Replace missing debut / finalGame values with the sentinel 0 so the year
# helpers can recognise them (str(0) == '0').
# The fill goes through DataFrame.fillna and rebinds df2: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# newer pandas versions warn about and which leaves df2 unchanged under
# copy-on-write.
df2 = df2.fillna({'debut': 0, 'finalGame': 0})

In [11]:
def get_debut_year(player):
    """Return the year of a player's debut game.

    Parameters
    ----------
    player : mapping or pandas.Series
        Must provide 'birthYear' (a number, possibly NaN) and 'debut'
        (a date string in 'm/d/yy' or 'YYYY-MM-DD' form, or 0 / NaN when the
        date is unknown).

    Returns
    -------
    int
        The debut year, or 0 when the debut date is missing.

    Raises
    ------
    ValueError
        If 'debut' is present but matches neither supported format.
    """
    birth_year = player['birthYear']
    debut_as_string = str(player['debut'])
    # '0' is the fill value for missing dates; the other spellings cover rows
    # whose missing value was never filled.
    if debut_as_string in ('0', '', 'nan', 'NaN', 'None', 'NaT'):
        return 0
    try:
        debut_year = datetime.strptime(debut_as_string, '%m/%d/%y').year
    except ValueError:
        # A four-digit year is unambiguous, so no century correction applies.
        return datetime.strptime(debut_as_string, '%Y-%m-%d').year
    # '%y' maps two-digit years onto 1969-2068, so older debuts land a century
    # too late; a debut more than 50 years after birth is treated as such.
    # If birthYear is NaN the comparison is False and the year is left as is.
    if debut_year - birth_year > 50:
        debut_year = debut_year - 100
    return debut_year

In [12]:
def get_final_year(player):
    """Return the year of a player's final game.

    Parameters
    ----------
    player : mapping or pandas.Series
        Must provide 'finalGame' (a date string in 'm/d/yy' or 'YYYY-MM-DD'
        form, or 0 / NaN when unknown) plus the fields read by
        ``get_debut_year`` ('birthYear' and 'debut').

    Returns
    -------
    int
        The final-game year, or 0 when the final-game date is missing.

    Raises
    ------
    ValueError
        If 'finalGame' (or 'debut') is present but matches neither supported
        format.
    """
    debut_year = get_debut_year(player)
    final_as_string = str(player['finalGame'])
    # '0' is the fill value for missing dates; the other spellings cover rows
    # whose missing value was never filled.
    if final_as_string in ('0', '', 'nan', 'NaN', 'None', 'NaT'):
        return 0
    try:
        final_year = datetime.strptime(final_as_string, '%m/%d/%y').year
    except ValueError:
        # A four-digit year is unambiguous, so no century correction applies.
        return datetime.strptime(final_as_string, '%Y-%m-%d').year
    # '%y' maps two-digit years onto 1969-2068; a final game more than 50
    # years after the debut is taken to be a century too late.
    # debut_year == 0 means the debut is unknown, so there is no reference
    # year to compare against and the parsed year is kept unchanged.
    if debut_year and final_year - debut_year > 50:
        final_year = final_year - 100
    return final_year

In [13]:
# Derive the debut / final-game years row by row from the raw date columns
# and attach them as two new columns.
df2 = df2.assign(
    debutYear=df2.apply(get_debut_year, axis=1),
    finalYear=df2.apply(get_final_year, axis=1),
)

In [14]:
# The raw date strings are no longer needed once the year columns exist.
df2 = df2.drop(['debut', 'finalGame'], axis='columns')

In [15]:
# This will be exported to a separate module - retroid_dict.py
# Build a playerID -> retroID lookup from the People table.
ids = pd.read_csv('../core/data/lahman/mlb_data/People.csv').loc[:, ['playerID', 'retroID']]
id_dict = ids.set_index('playerID')['retroID'].to_dict()

def get_retroid(id):
    """Return the retroID mapped to a Lahman playerID.

    Parameters
    ----------
    id : str
        A Lahman playerID to look up in the module-level ``id_dict``.

    Returns
    -------
    The value stored for ``id`` in ``id_dict``, or '' when ``id`` is not a
    key of the lookup table.
    """
    # dict.get supplies the '' fallback for IDs missing from the table instead
    # of raising KeyError.
    # NOTE(review): the stored value may itself be missing (NaN) when the
    # People row has no retroID - confirm against the data.
    return id_dict.get(id, '')

In [16]:
# Translate Lahman playerIDs into retroIDs in both tables and rename the
# column to match its new content.
df1 = (
    df1
    .assign(playerID=df1['playerID'].apply(get_retroid))
    .rename(columns={'playerID': 'retroID'})
)
df2 = (
    df2
    .assign(playerID=df2['playerID'].apply(get_retroid))
    .rename(columns={'playerID': 'retroID'})
)

In [17]:
# Spot-check a single player's row after the ID conversion.
df2[df2['retroID'] == 'puelc001']

Unnamed: 0,retroID,nameFirst,nameLast,birthYear,bats,throws,weight,height,debutYear,finalYear
14447,puelc001,Cesar,Puello,1991.0,R,R,220.0,74.0,2017,2019


In [18]:
# Collapse each player's fielding rows to one position: the most frequent
# value in the group (the first entry of mode() when several tie).
df1 = df1.groupby('retroID').agg(lambda positions: positions.mode()[0])

In [19]:
# Turn the groupby key (retroID) back into a regular column.
df1 = df1.reset_index()

In [20]:
# Discard people rows that have no retroID.
df2 = df2.dropna(subset=['retroID'])

In [21]:
# (rows, columns) of the per-player position table.
df1.shape

(15293, 2)

In [22]:
# Count the fielding players whose retroID also appears in the people table;
# a shape equal to df1.shape means every one of them has a match.
df1[df1['retroID'].isin(df2['retroID'])].shape

(15293, 2)

In [23]:
# Join positions with the people attributes on the shared retroID key.
df = df1.merge(df2, how='inner', on='retroID')

In [24]:
# Preview the merged player table.
df

Unnamed: 0,retroID,POS,nameFirst,nameLast,birthYear,bats,throws,weight,height,debutYear,finalYear
0,aardd001,P,David,Aardsma,1981.0,R,R,215.0,75.0,2004,2015
1,aaroh101,OF,Hank,Aaron,1934.0,R,R,180.0,72.0,1954,1976
2,aarot101,1B,Tommie,Aaron,1939.0,R,R,190.0,75.0,1962,1971
3,aased001,P,Don,Aase,1954.0,R,R,190.0,75.0,1977,1990
4,abada001,1B,Andy,Abad,1972.0,L,L,184.0,73.0,2001,2006
...,...,...,...,...,...,...,...,...,...,...,...
15288,zupcb001,OF,Bob,Zupcic,1966.0,R,R,220.0,76.0,1991,1994
15289,zupof101,C,Frank,Zupo,1939.0,L,R,182.0,71.0,1957,1961
15290,zuveg101,P,George,Zuverink,1924.0,R,R,195.0,76.0,1951,1959
15291,zuvep001,SS,Paul,Zuvella,1958.0,R,R,173.0,72.0,1982,1991


In [25]:
# Check whether any row has bats equal to 0 (the result below is empty).
df[df['bats'] == 0]

Unnamed: 0,retroID,POS,nameFirst,nameLast,birthYear,bats,throws,weight,height,debutYear,finalYear


In [26]:
# Report which columns still contain missing values.
df.isnull().any()

retroID      False
POS          False
nameFirst    False
nameLast     False
birthYear    False
bats          True
throws       False
weight        True
height        True
debutYear    False
finalYear    False
dtype: bool

In [29]:
# Impute missing weight / height with the respective column means.
# The fill goes through DataFrame.fillna and rebinds df: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# newer pandas versions warn about and which leaves df unchanged under
# copy-on-write.
mean_weight = df['weight'].mean()
mean_height = df['height'].mean()
df = df.fillna({'weight': mean_weight, 'height': mean_height})

In [30]:
# Re-check for missing values after the weight / height fill.
df.isnull().any()

retroID      False
POS          False
nameFirst    False
nameLast     False
birthYear    False
bats          True
throws       False
weight       False
height       False
debutYear    False
finalYear    False
dtype: bool