In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
df = pd.read_csv('../core/data/lahman/mlb_data/Fielding.csv').sort_values('playerID')

In [4]:
# This will be exported to a separate module
ids = pd.read_csv('../core/data/lahman/mlb_data/People.csv')
ids = ids[['playerID', 'retroID']]
id_dict = ids.set_index('playerID').to_dict()['retroID']

def get_retroid(id):
    return id_dict[id] if id_dict is not None else id

In [5]:
df['playerID'] = df['playerID'].apply(get_retroid)
df.rename(columns={'playerID': 'retroID'}, inplace=True)

<h3>Exploration</h3>

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112837 entries, 85308 to 106797
Data columns (total 18 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   retroID  112837 non-null  object 
 1   yearID   112837 non-null  int64  
 2   stint    112837 non-null  int64  
 3   teamID   112837 non-null  object 
 4   lgID     112837 non-null  object 
 5   POS      112837 non-null  object 
 6   G        112837 non-null  int64  
 7   GS       89431 non-null   float64
 8   InnOuts  89431 non-null   float64
 9   PO       112837 non-null  int64  
 10  A        112837 non-null  int64  
 11  E        112836 non-null  float64
 12  DP       112837 non-null  int64  
 13  PB       8538 non-null    float64
 14  WP       1169 non-null    float64
 15  SB       6389 non-null    float64
 16  CS       6389 non-null    float64
 17  ZR       1169 non-null    float64
dtypes: float64(8), int64(6), object(4)
memory usage: 16.4+ MB


In [7]:
df.shape

(112837, 18)

In [8]:
df.columns

Index(['retroID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS',
       'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB', 'CS', 'ZR'],
      dtype='object')

<p>We want to get rid of columns which already exist in the Batting DataFrame (with which we will be merging this)<p>

In [9]:
columns_to_drop = ['stint', 'teamID', 'lgID', 'G']

In [10]:
df.drop(columns=columns_to_drop, inplace=True)

In [11]:
df.head()

Unnamed: 0,retroID,yearID,POS,GS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
85308,aardd001,2004,P,0.0,32.0,0,0,0.0,0,,,,,
101187,aardd001,2013,P,0.0,119.0,1,5,0.0,0,,,,,
99344,aardd001,2012,P,0.0,3.0,0,0,0.0,0,,,,,
95793,aardd001,2010,P,0.0,149.0,2,3,1.0,0,,,,,
104866,aardd001,2015,P,0.0,92.0,0,1,1.0,0,,,,,


<h3>Cleaning and Preprocessing</h3>

<p>We see a lot of NaNs in the last 5 columns. According to the Lahman readme, these are:</p>

 - PB - Passed Balls (by catchers)

 - WP - Wild Pitches (by catchers)

 - SB - Opponent Stolen Bases (by catchers)

 - CS - Opponents Caught Stealing (by catchers)

 - ZR - Zone Rating

<p>It looks like the data demands that we treat catchers separately from other position players. This intuitively makes sense from what we know about baseball, and it saves us from getting rid of a lot of data. First, though, let's look at how much of that data is missing if we JUST look at catchers.</p>

In [12]:
df_catchers = df[df['POS'] == 'C']

In [13]:
# Get missing data in the catchers category as a percentage
100 * df_catchers.isnull().sum() / len(df)

retroID    0.000000
yearID     0.000000
POS        0.000000
GS         1.901858
InnOuts    1.901858
PO         0.000000
A          0.000000
E          0.000000
DP         0.000000
PB         0.000000
WP         6.530659
SB         1.904517
CS         1.904517
ZR         6.530659
dtype: float64

<p>Most of the percentages are negligable, but we can take a look at WP and ZR and see if the missing data is from early years.</p>

In [14]:
early_catchers = df_catchers[df_catchers['yearID'] < 1955]

In [15]:
100 * early_catchers.isnull().sum() / len(df)

retroID    0.000000
yearID     0.000000
POS        0.000000
GS         1.901858
InnOuts    1.901858
PO         0.000000
A          0.000000
E          0.000000
DP         0.000000
PB         0.000000
WP         1.901858
SB         1.901858
CS         1.901858
ZR         1.901858
dtype: float64

<p>Definitely not the case. Let's try to narrow down where the issue is.</p>

In [16]:
post1985_catchers = df_catchers[df_catchers['yearID'] > 1985]

In [17]:
100 * post1985_catchers.isnull().sum() / len(df)

retroID    0.000000
yearID     0.000000
POS        0.000000
GS         0.000000
InnOuts    0.000000
PO         0.000000
A          0.000000
E          0.000000
DP         0.000000
PB         0.000000
WP         3.265773
SB         0.000000
CS         0.000000
ZR         3.265773
dtype: float64

In [18]:
df_1955_to_1986_catchers = df_catchers[(df_catchers['yearID'] >= 1955) & (df_catchers['yearID'] <= 1985)]

In [19]:
100 * df_1955_to_1986_catchers.isnull().sum() / len(df)

retroID    0.000000
yearID     0.000000
POS        0.000000
GS         0.000000
InnOuts    0.000000
PO         0.000000
A          0.000000
E          0.000000
DP         0.000000
PB         0.000000
WP         1.363028
SB         0.002659
CS         0.002659
ZR         1.363028
dtype: float64

In [20]:
pre_1930_catchers = df_catchers[df_catchers['yearID'] < 1930]

In [21]:
100 * pre_1930_catchers.isnull().sum() / len(df)

retroID    0.000000
yearID     0.000000
POS        0.000000
GS         0.591118
InnOuts    0.591118
PO         0.000000
A          0.000000
E          0.000000
DP         0.000000
PB         0.000000
WP         0.591118
SB         0.591118
CS         0.591118
ZR         0.591118
dtype: float64

<p>We see that the issue is mainly in the very early years, and we are fine with dropping that information by just filling it in as we did in the Batters table.</p>

<p>So with that, we are fine with filling all NA values with 0.</p>

In [22]:
df_catchers['GS'].fillna(value=0, inplace=True)
df_catchers['InnOuts'].fillna(value=0, inplace=True)
df_catchers['WP'].fillna(value=0, inplace=True)
df_catchers['SB'].fillna(value=0, inplace=True)
df_catchers['CS'].fillna(value=0, inplace=True)
df_catchers['ZR'].fillna(value=0, inplace=True)

In [23]:
df['GS'].fillna(value=0, inplace=True)
df['InnOuts'].fillna(value=0, inplace=True)
#We can just drop the catcher-related columns from the original dataframe, as we will also drop all catcher rows
catcher_columns = ['PB', 'WP', 'SB', 'CS', 'ZR']
df.drop(columns=catcher_columns, inplace=True)

<p>Now drop all catcher rows so we have two separate dataframes, and get rid of the yearID column which we're done with and will be useless after aggregation.</p>

In [24]:
df = df[df['POS'] != 'C']

In [25]:
df.drop(columns=['yearID'], inplace=True)
df_catchers.drop(columns=['yearID'], inplace=True)

In [26]:
df.shape

(104299, 8)

In [27]:
df_catchers.shape

(8538, 13)

In [28]:
100 * df.isnull().sum() / len(df)

retroID    0.000000
POS        0.000000
GS         0.000000
InnOuts    0.000000
PO         0.000000
A          0.000000
E          0.000959
DP         0.000000
dtype: float64

<p>Now we just see a little bit of information missing from Errors, so we can fill that with 0s no problem.</p>

In [29]:
df['E'].fillna(value=0, inplace=True)

In [30]:
100 * df.isnull().sum() / len(df)

retroID    0.0
POS        0.0
GS         0.0
InnOuts    0.0
PO         0.0
A          0.0
E          0.0
DP         0.0
dtype: float64

In [31]:
100 * df_catchers.isnull().sum() / len(df)

retroID    0.0
POS        0.0
GS         0.0
InnOuts    0.0
PO         0.0
A          0.0
E          0.0
DP         0.0
PB         0.0
WP         0.0
SB         0.0
CS         0.0
ZR         0.0
dtype: float64

<p>At this point we have essentially the same data as what we had in Fielding.csv from Lahman, but we have cleaned the data and gotten rid of missing values. Before we groupby and take out metadata, it would be a good idea to save this to a .csv so that we can later get it as a DataFrame and not worry about missing values.</p>

In [33]:
df.to_csv('../core/output/fielding.csv')
df_catchers.to_csv('../core/output/catching.csv')

<h3>Aggregation</h3>

<p>Now we just need to aggregate all stats to get total career numbers for each player.</p>

In [77]:
df = df.groupby('retroID').sum().reset_index()

In [78]:
df_catchers = df_catchers.groupby('retroID').sum().reset_index()

In [79]:
df

Unnamed: 0,retroID,GS,InnOuts,PO,A,E,DP
0,aardd001,0.0,1011.0,11,29,3.0,2
1,aaroh101,2977.0,78414.0,7436,429,144.0,218
2,aarot101,206.0,6472.0,1317,113,22.0,124
3,aased001,91.0,3328.0,67,135,13.0,10
4,abada001,4.0,138.0,37,1,1.0,3
...,...,...,...,...,...,...,...
14222,zumaj001,0.0,629.0,7,14,2.0,1
14223,zupcb001,198.0,5842.0,483,22,12.0,5
14224,zuveg101,31.0,1847.0,45,145,7.0,10
14225,zuvep001,136.0,3844.0,267,415,23.0,84


In [80]:
df_catchers

Unnamed: 0,retroID,GS,InnOuts,PO,A,E,DP,PB,WP,SB,CS,ZR
0,adamb105,1.0,27.0,6,0,0.0,0,0.0,0.0,1.0,0.0,0.0
1,adamb106,0.0,0.0,249,90,12.0,15,7.0,0.0,0.0,0.0,0.0
2,adamd101,3.0,78.0,9,2,0.0,0,1.0,0.0,0.0,0.0,0.0
3,adled101,65.0,1840.0,453,26,4.0,2,8.0,19.0,37.0,16.0,0.0
4,afent001,20.0,613.0,123,5,1.0,3,6.0,0.0,17.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1524,zimmd101,27.0,744.0,150,18,6.0,1,5.0,12.0,10.0,10.0,3.0
1525,zimmj101,298.0,8560.0,2131,150,21.0,26,19.0,84.0,110.0,80.0,4.0
1526,zinta001,0.0,3.0,2,0,0.0,0,0.0,0.0,0.0,0.0,0.0
1527,zunim001,535.0,14489.0,4356,264,21.0,22,39.0,0.0,248.0,98.0,0.0
