In [969]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# We're going to be reassigning some columns, so we'll turn off this warning - we know what we're doing!
pd.options.mode.chained_assignment = None  # default='warn'

In [970]:
df = pd.read_csv('./data/lahman/mlb_data/Batting.csv').sort_values('playerID')

In [971]:
# The rest of the notebook refers to the player key as 'retroID', so relabel the
# 'playerID' column once here, right after loading.
# NOTE(review): only the column label changes; the values are still whatever
# Batting.csv stores under 'playerID' — confirm they match the key used by any
# table this is joined with later.
df.rename(columns={'playerID': 'retroID'}, inplace=True)

<h1>Cleaning the Data - Missing Values</h1>

<h4>Print percentages of missing data in each column of the batting table</h4>

In [972]:
100 * df.isnull().sum() / len(df)

retroID     0.000000
yearID      0.000000
stint       0.000000
teamID      0.000000
lgID        0.000000
G           0.000000
AB          0.000000
R           0.000000
H           0.000000
2B          0.000000
3B          0.000000
HR          0.000000
RBI         0.000000
SB          0.000000
CS          8.221708
BB          0.000000
SO          0.000000
IBB        21.711883
HBP         0.000000
SH          0.000000
SF         21.090864
GIDP        9.839985
dtype: float64

<p>Since this data is by season, it's likely that we have entries for a player for one season with no data in these fields but there is data for other seasons. Since we're taking aggregate sums for each player, we have two options: set these null values to zero so they don't add to the sum, or set them to the average for that player. We'll have to test the theory to see which is more viable.</p>

<p>We're going to start with IBB rather than CS, since it's a more significant chunk of the dataset.</p>

<h3>Handling missing IBB data</h3>

In [973]:
df[(df['IBB'].isnull())]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
19269,aaronha01,1954,1,ML1,NL,122,468,58,131,27,...,69,2,2.0,28,39,,3,6,4.0,13.0
18684,aberal01,1953,2,DET,AL,17,23,2,3,0,...,2,0,0.0,1,6,,0,1,,0.0
16858,aberal01,1950,1,CLE,AL,1,2,0,0,0,...,0,0,0.0,1,1,,0,0,,0.0
19270,aberal01,1954,1,DET,AL,32,39,3,5,0,...,3,0,0.0,2,17,,0,3,1.0,1.0
18683,aberal01,1953,1,CLE,AL,6,0,0,0,0,...,0,0,0.0,2,0,,0,0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15128,zuberbi01,1946,1,NYA,AL,3,2,0,0,0,...,0,0,0.0,0,1,,0,0,,0.0
14447,zuberbi01,1945,1,NYA,AL,21,42,1,7,0,...,3,0,0.0,1,13,,0,2,,1.0
13868,zuberbi01,1944,1,NYA,AL,22,31,1,4,0,...,1,0,0.0,0,10,,0,4,,1.0
19843,zuverge01,1954,1,CIN,NL,2,2,1,1,0,...,0,0,0.0,0,1,,0,0,0.0,0.0


In [974]:
df[(df['retroID'] == 'aberal01')]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
19846,aberal01,1955,1,DET,AL,39,17,0,1,0,...,0,0,0.0,0,9,0.0,0,2,0.0,1.0
18684,aberal01,1953,2,DET,AL,17,23,2,3,0,...,2,0,0.0,1,6,,0,1,,0.0
16858,aberal01,1950,1,CLE,AL,1,2,0,0,0,...,0,0,0.0,1,1,,0,0,,0.0
19270,aberal01,1954,1,DET,AL,32,39,3,5,0,...,3,0,0.0,2,17,,0,3,1.0,1.0
18683,aberal01,1953,1,CLE,AL,6,0,0,0,0,...,0,0,0.0,2,0,,0,0,,0.0
21123,aberal01,1957,2,KC1,AL,3,1,0,1,0,...,0,0,0.0,0,0,0.0,0,0,0.0,0.0
20501,aberal01,1956,1,DET,AL,42,10,0,3,0,...,0,0,0.0,1,4,0.0,0,2,0.0,0.0
21122,aberal01,1957,1,DET,AL,28,8,0,1,0,...,1,0,0.0,1,4,0.0,0,0,0.0,0.0


In [975]:
df[(df['retroID'] == 'zuberbi01')]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
9445,zuberbi01,1936,1,CLE,AL,2,5,1,1,0,...,0,0,0.0,0,1,,0,0,,
10501,zuberbi01,1938,1,CLE,AL,15,7,0,0,0,...,0,0,0.0,0,1,,0,1,,
11080,zuberbi01,1939,1,CLE,AL,16,5,0,1,0,...,0,0,0.0,0,2,,0,0,,0.0
11621,zuberbi01,1940,1,CLE,AL,17,3,0,1,0,...,0,0,0.0,0,0,,0,0,,0.0
12203,zuberbi01,1941,1,WS1,AL,36,26,0,0,0,...,0,0,0.0,1,8,,0,2,,0.0
12742,zuberbi01,1942,1,WS1,AL,37,39,5,6,3,...,3,0,0.0,1,7,,0,3,,2.0
13299,zuberbi01,1943,1,NYA,AL,20,38,1,7,1,...,2,0,0.0,4,14,,0,5,,2.0
15711,zuberbi01,1947,1,BOS,AL,20,13,0,2,0,...,0,0,0.0,2,3,,0,2,,2.0
15129,zuberbi01,1946,2,BOS,AL,15,18,1,2,0,...,2,0,0.0,1,6,,0,1,,0.0
15128,zuberbi01,1946,1,NYA,AL,3,2,0,0,0,...,0,0,0.0,0,1,,0,0,,0.0


<p>First let's look at IBB, intentional bases on balls. It seems like most of the missing data is from early in the dataset - it could be that IBB was not recorded then, and/or not considered a trackable play?</p>

In [976]:
df[(df['IBB'].isnull())]['yearID'].max()

1954

In [977]:
df[(df['IBB'].isnull())]['yearID'].min()

1919

In [978]:
df[(df['IBB'].isnull())]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
19269,aaronha01,1954,1,ML1,NL,122,468,58,131,27,...,69,2,2.0,28,39,,3,6,4.0,13.0
18684,aberal01,1953,2,DET,AL,17,23,2,3,0,...,2,0,0.0,1,6,,0,1,,0.0
16858,aberal01,1950,1,CLE,AL,1,2,0,0,0,...,0,0,0.0,1,1,,0,0,,0.0
19270,aberal01,1954,1,DET,AL,32,39,3,5,0,...,3,0,0.0,2,17,,0,3,1.0,1.0
18683,aberal01,1953,1,CLE,AL,6,0,0,0,0,...,0,0,0.0,2,0,,0,0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15128,zuberbi01,1946,1,NYA,AL,3,2,0,0,0,...,0,0,0.0,0,1,,0,0,,0.0
14447,zuberbi01,1945,1,NYA,AL,21,42,1,7,0,...,3,0,0.0,1,13,,0,2,,1.0
13868,zuberbi01,1944,1,NYA,AL,22,31,1,4,0,...,1,0,0.0,0,10,,0,4,,1.0
19843,zuverge01,1954,1,CIN,NL,2,2,1,1,0,...,0,0,0.0,0,1,,0,0,0.0,0.0


<p>We have 19159 total rows where there is no data for IBB, and we know none of those rows goes past the year 1954...</p>

In [979]:
df[(df['yearID'] < 1955)]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
19269,aaronha01,1954,1,ML1,NL,122,468,58,131,27,...,69,2,2.0,28,39,,3,6,4.0,13.0
18684,aberal01,1953,2,DET,AL,17,23,2,3,0,...,2,0,0.0,1,6,,0,1,,0.0
16858,aberal01,1950,1,CLE,AL,1,2,0,0,0,...,0,0,0.0,1,1,,0,0,,0.0
19270,aberal01,1954,1,DET,AL,32,39,3,5,0,...,3,0,0.0,2,17,,0,3,1.0,1.0
18683,aberal01,1953,1,CLE,AL,6,0,0,0,0,...,0,0,0.0,2,0,,0,0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13868,zuberbi01,1944,1,NYA,AL,22,31,1,4,0,...,1,0,0.0,0,10,,0,4,,1.0
18682,zuverge01,1952,1,CLE,AL,2,0,1,0,0,...,0,0,0.0,0,0,0.0,0,0,0.0,0.0
19843,zuverge01,1954,1,CIN,NL,2,2,1,1,0,...,0,0,0.0,0,1,,0,0,0.0,0.0
19844,zuverge01,1954,2,DET,AL,35,64,1,8,1,...,3,0,1.0,1,14,,0,9,0.0,2.0


<p>And we have 19845 total rows up to the year 1954. That means...</p>

In [980]:
# Share of pre-1955 rows that have no IBB value. Computed from the frame itself
# so the figure stays correct if the CSV changes, instead of relying on counts
# copied out of earlier cell outputs.
df['IBB'].isnull().sum() / (df['yearID'] < 1955).sum()

0.9654320987654321

<p>Over 96% of the data before 1955 is missing IBB. I think this gives justification to just setting all of those NaNs to 0.</p>

In [981]:
# Assign the filled column back explicitly: an in-place fillna on df['IBB'] is
# chained assignment and is not guaranteed to write through to df.
df['IBB'] = df['IBB'].fillna(0)

In [982]:
100 * df.isnull().sum() / len(df)

retroID     0.000000
yearID      0.000000
stint       0.000000
teamID      0.000000
lgID        0.000000
G           0.000000
AB          0.000000
R           0.000000
H           0.000000
2B          0.000000
3B          0.000000
HR          0.000000
RBI         0.000000
SB          0.000000
CS          8.221708
BB          0.000000
SO          0.000000
IBB         0.000000
HBP         0.000000
SH          0.000000
SF         21.090864
GIDP        9.839985
dtype: float64

<p>Our IBB issue is solved. Let's move on to SF (sacrifice flies). We'll check the years and rows again to see if we're justified in using the same method to eliminate nulls.</p>

<h3>Handling missing SF data</h3>

In [983]:
df[(df['SF'].isnull())]['yearID'].max()

1953

In [984]:
df[(df['SF'].isnull())]['yearID'].min()

1919

In [985]:
df[(df['SF'].isnull())].shape[0]

18611

In [986]:
df[(df['yearID'] < 1954)].shape[0]

19269

In [987]:
# Share of pre-1954 rows that have no SF value, derived from the data rather
# than from numbers copied out of the two previous outputs.
df['SF'].isnull().sum() / (df['yearID'] < 1954).sum()

0.965851886449738

<p>Almost the same percentage, and one less year covered. I think we can fill those missing values with 0.</p>

In [988]:
# Assign the filled column back explicitly: an in-place fillna on df['SF'] is
# chained assignment and is not guaranteed to write through to df.
df['SF'] = df['SF'].fillna(0)

In [989]:
100 * df.isnull().sum() / len(df)

retroID    0.000000
yearID     0.000000
stint      0.000000
teamID     0.000000
lgID       0.000000
G          0.000000
AB         0.000000
R          0.000000
H          0.000000
2B         0.000000
3B         0.000000
HR         0.000000
RBI        0.000000
SB         0.000000
CS         8.221708
BB         0.000000
SO         0.000000
IBB        0.000000
HBP        0.000000
SH         0.000000
SF         0.000000
GIDP       9.839985
dtype: float64

<p>Two more to go, let's move on to CS (caught stealing)</p>

<h3>Handling missing CS data</h3>

In [990]:
df[(df['CS'].isnull())]['yearID'].max()

1950

In [991]:
df[(df['CS'].isnull())]['yearID'].min()

1919

In [992]:
df[(df['CS'].isnull())]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
14448,abernwo01,1946,1,NY1,NL,15,8,0,0,0,...,0,0,,0,4,0.0,0,0,0.0,0.0
16285,aberscl01,1949,1,CHN,NL,4,7,0,0,0,...,0,0,,0,2,0.0,0,0,0.0,1.0
15712,aberscl01,1948,1,CHN,NL,12,32,1,6,1,...,6,0,,5,10,0.0,0,0,0.0,0.0
15131,aberscl01,1947,1,CHN,NL,47,140,24,39,6,...,20,0,,20,32,0.0,0,0,0.0,5.0
16286,abramca01,1949,1,BRO,NL,8,24,6,2,1,...,0,1,,7,6,0.0,0,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,zitzmbi01,1919,1,PIT,NL,11,26,5,5,1,...,2,2,,0,6,0.0,0,1,0.0,
4782,zitzmbi01,1927,1,CIN,NL,88,232,47,66,10,...,24,9,,20,18,0.0,4,17,0.0,
5312,zitzmbi01,1928,1,CIN,NL,101,266,53,79,9,...,33,13,,13,22,0.0,3,14,0.0,
533,zitzmbi01,1919,2,CIN,NL,2,1,0,0,0,...,0,0,,0,0,0.0,0,0,0.0,


In [993]:
df[(df['retroID'] == 'zitzmbi01')]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
4241,zitzmbi01,1926,1,CIN,NL,53,94,21,23,2,...,3,3,,6,7,0.0,2,3,0.0,
532,zitzmbi01,1919,1,PIT,NL,11,26,5,5,1,...,2,2,,0,6,0.0,0,1,0.0,
3715,zitzmbi01,1925,1,CIN,NL,104,301,53,76,13,...,21,11,11.0,35,22,0.0,6,2,0.0,
4782,zitzmbi01,1927,1,CIN,NL,88,232,47,66,10,...,24,9,,20,18,0.0,4,17,0.0,
5312,zitzmbi01,1928,1,CIN,NL,101,266,53,79,9,...,33,13,,13,22,0.0,3,14,0.0,
533,zitzmbi01,1919,2,CIN,NL,2,1,0,0,0,...,0,0,,0,0,0.0,0,0,0.0,
5842,zitzmbi01,1929,1,CIN,NL,47,84,18,19,3,...,6,4,,9,10,0.0,1,2,0.0,


In [994]:
df[(df['CS'].isnull())].shape[0]

7255

In [995]:
df[(df['yearID'] < 1951)].shape[0]

17435

In [996]:
# Share of pre-1951 rows that have no CS value, derived from the data rather
# than from numbers copied out of the two previous outputs.
df['CS'].isnull().sum() / (df['yearID'] < 1951).sum()

0.4161170060223688

<p>There isn't a great solution for this. If we drop all missing rows with NaN for CS, we're going to lose over 41% of the data prior to 1951. It doesn't encompass enough of the data to just fill in values like we did before, we can't drop rows, and we don't want to drop the column since it isn't missing any data after 1950. One idea, and this may be controversial, is to find the average ratio between SB (stolen bases) and CS and fill in with values based on that ratio.</p>

<p>First, we'll get all data without missing CS values.</p>

In [997]:
# Keep only the rows where CS was actually recorded.
df_temp = df.dropna(subset=['CS'])

In [998]:
total_sb = df_temp['SB'].sum()
total_sb

182622

In [999]:
total_cs = df_temp['CS'].sum()
total_cs

94186.0

In [1000]:
total_sb/total_cs

1.9389505871360924

<p>So on average, players are almost twice as likely to steal a base as they are to get caught. This is easy math that we're going to round to make it even easier. It's probably not the best method of solving this issue but at least we still have over 60 years of clean data!</p>

In [1001]:
df[(df['CS'].isnull())]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
14448,abernwo01,1946,1,NY1,NL,15,8,0,0,0,...,0,0,,0,4,0.0,0,0,0.0,0.0
16285,aberscl01,1949,1,CHN,NL,4,7,0,0,0,...,0,0,,0,2,0.0,0,0,0.0,1.0
15712,aberscl01,1948,1,CHN,NL,12,32,1,6,1,...,6,0,,5,10,0.0,0,0,0.0,0.0
15131,aberscl01,1947,1,CHN,NL,47,140,24,39,6,...,20,0,,20,32,0.0,0,0,0.0,5.0
16286,abramca01,1949,1,BRO,NL,8,24,6,2,1,...,0,1,,7,6,0.0,0,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532,zitzmbi01,1919,1,PIT,NL,11,26,5,5,1,...,2,2,,0,6,0.0,0,1,0.0,
4782,zitzmbi01,1927,1,CIN,NL,88,232,47,66,10,...,24,9,,20,18,0.0,4,17,0.0,
5312,zitzmbi01,1928,1,CIN,NL,101,266,53,79,9,...,33,13,,13,22,0.0,3,14,0.0,
533,zitzmbi01,1919,2,CIN,NL,2,1,0,0,0,...,0,0,,0,0,0.0,0,0,0.0,


In [1002]:
df[(df['CS']).isnull()].apply(lambda x: x['SB'] / 2, axis=1)

14448    0.0
16285    0.0
15712    0.0
15131    0.0
16286    0.5
        ... 
532      1.0
4782     4.5
5312     6.5
533      0.0
5842     2.0
Length: 7255, dtype: float64

In [1003]:
df[(df['CS']).isnull()].apply(lambda x: x['SB'] / 2, axis=1).value_counts()

0.0     4494
0.5      794
1.0      438
1.5      303
2.0      257
2.5      165
3.0      139
3.5      131
4.0       91
4.5       81
5.0       51
5.5       50
6.5       38
6.0       36
7.5       28
7.0       22
8.0       21
9.0       19
8.5       16
9.5       11
11.5       8
10.0       8
10.5       8
11.0       7
13.0       6
14.0       5
12.0       5
14.5       3
18.5       3
12.5       2
17.5       2
13.5       2
16.5       2
16.0       2
20.0       1
15.0       1
15.5       1
24.0       1
18.0       1
17.0       1
21.5       1
dtype: int64

<p>I don't love the max of 24, but overall these values look good and we definitely don't have many of the higher values. So we're going to apply this to our missing CS data</p>

<p>First I'm going to test it out on a copy</p>

In [1004]:
# Take an explicit copy of the rows with missing CS so the trial assignment in
# the next cell writes to an independent frame and cannot touch (or warn about) df.
df_temp = df[df['CS'].isnull()].copy()

In [1005]:
# Estimate CS as half of SB for every row of the trial frame. Plain column
# arithmetic gives the same values as a row-by-row apply without the Python loop.
df_temp['CS'] = df_temp['SB'] / 2

In [1006]:
df_temp[(df_temp['retroID'] == 'zitzmbi01')]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
4241,zitzmbi01,1926,1,CIN,NL,53,94,21,23,2,...,3,3,1.5,6,7,0.0,2,3,0.0,
532,zitzmbi01,1919,1,PIT,NL,11,26,5,5,1,...,2,2,1.0,0,6,0.0,0,1,0.0,
4782,zitzmbi01,1927,1,CIN,NL,88,232,47,66,10,...,24,9,4.5,20,18,0.0,4,17,0.0,
5312,zitzmbi01,1928,1,CIN,NL,101,266,53,79,9,...,33,13,6.5,13,22,0.0,3,14,0.0,
533,zitzmbi01,1919,2,CIN,NL,2,1,0,0,0,...,0,0,0.0,0,0,0.0,0,0,0.0,
5842,zitzmbi01,1929,1,CIN,NL,47,84,18,19,3,...,6,4,2.0,9,10,0.0,1,2,0.0,


<p>We know from before that this guy had NaNs for his CS and now it's all filled in, so our plan worked. Let's do it for the actual data</p>

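<p>My first attempt assigned through a filtered frame, which has no effect: <code>df[mask]</code> returns a copy, so the assignment lands on a temporary object and never reaches <code>df</code>. Here's what I tried originally:</p>

<p><code>df[(df['CS']).isnull()]['CS'] = df.apply(lambda x: x['SB'] / 2, axis=1)</code></p>

<p>Label-based assignment such as <code>df.loc[df['CS'].isnull(), 'CS'] = df['SB'] / 2</code> is the direct way to do this. Here we'll get a little hacky instead and apply a function with a conditional, which produces the same values.</p>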

In [1007]:
def fill_cs(data, sb_per_cs=2):
    """Return a row's CS value, estimating it from SB when CS is missing.

    Parameters
    ----------
    data : mapping or pandas.Series
        A single batting row; must provide the keys 'CS' and 'SB'.
    sb_per_cs : number, optional
        Assumed number of stolen bases per caught-stealing, used only when
        CS is NaN. Defaults to 2, the rounded ratio used in this notebook.

    Returns
    -------
    number
        ``data['CS']`` unchanged when it is not NaN, otherwise
        ``data['SB'] / sb_per_cs``.
    """
    caught = data['CS']
    if math.isnan(caught):
        # No recorded value: fall back to the SB-based estimate.
        return data['SB'] / sb_per_cs
    return caught

In [1008]:
# Run fill_cs over every row; rows that already have a CS value pass through unchanged.
df['CS'] = df.apply(fill_cs, axis=1)

In [1009]:
df[(df['retroID'] == 'zitzmbi01')]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
4241,zitzmbi01,1926,1,CIN,NL,53,94,21,23,2,...,3,3,1.5,6,7,0.0,2,3,0.0,
532,zitzmbi01,1919,1,PIT,NL,11,26,5,5,1,...,2,2,1.0,0,6,0.0,0,1,0.0,
3715,zitzmbi01,1925,1,CIN,NL,104,301,53,76,13,...,21,11,11.0,35,22,0.0,6,2,0.0,
4782,zitzmbi01,1927,1,CIN,NL,88,232,47,66,10,...,24,9,4.5,20,18,0.0,4,17,0.0,
5312,zitzmbi01,1928,1,CIN,NL,101,266,53,79,9,...,33,13,6.5,13,22,0.0,3,14,0.0,
533,zitzmbi01,1919,2,CIN,NL,2,1,0,0,0,...,0,0,0.0,0,0,0.0,0,0,0.0,
5842,zitzmbi01,1929,1,CIN,NL,47,84,18,19,3,...,6,4,2.0,9,10,0.0,1,2,0.0,


In [1010]:
100 * df.isnull().sum() / len(df)

retroID    0.000000
yearID     0.000000
stint      0.000000
teamID     0.000000
lgID       0.000000
G          0.000000
AB         0.000000
R          0.000000
H          0.000000
2B         0.000000
3B         0.000000
HR         0.000000
RBI        0.000000
SB         0.000000
CS         0.000000
BB         0.000000
SO         0.000000
IBB        0.000000
HBP        0.000000
SH         0.000000
SF         0.000000
GIDP       9.839985
dtype: float64

<h3>Handling missing GIDP data</h3>

In [1011]:
df[(df['GIDP'].isnull())]

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
2082,abramge01,1923,1,CIN,NL,3,1,0,1,0,...,0,0,0.0,0,0,0.0,0,0,0.0,
534,acostjo01,1920,1,WS1,AL,17,25,2,6,1,...,1,0,0.0,4,7,0.0,0,2,0.0,
1569,acostjo01,1922,1,CHA,AL,5,5,0,1,0,...,0,0,0.0,1,1,0.0,0,0,0.0,
1049,acostjo01,1921,1,WS1,AL,33,30,2,2,0,...,0,1,0.0,6,14,0.0,0,1,0.0,
6374,adairji01,1931,1,CHN,NL,18,76,9,21,3,...,3,1,0.5,1,8,0.0,0,2,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5312,zitzmbi01,1928,1,CIN,NL,101,266,53,79,9,...,33,13,6.5,13,22,0.0,3,14,0.0,
533,zitzmbi01,1919,2,CIN,NL,2,1,0,0,0,...,0,0,0.0,0,0,0.0,0,0,0.0,
5842,zitzmbi01,1929,1,CIN,NL,47,84,18,19,3,...,6,4,2.0,9,10,0.0,1,2,0.0,
9445,zuberbi01,1936,1,CLE,AL,2,5,1,1,0,...,0,0,0.0,0,1,0.0,0,0,0.0,


In [1012]:
df[(df['GIDP'].isnull())]['yearID'].max()

1938

In [1013]:
df[(df['yearID'] < 1939)].shape[0]

10502

In [1014]:
df[(df['GIDP'].isnull())].shape[0]

8683

In [1015]:
# Share of pre-1939 rows that have no GIDP value, derived from the data rather
# than from numbers copied out of the two previous outputs.
df['GIDP'].isnull().sum() / (df['yearID'] < 1939).sum()

0.8267948962102457

<p>Over 82% of records before 1939 are missing GIDP, but it doesn't extend beyond that. I think we can once again just fill the values in with 0</p>

In [1016]:
# Assign the filled column back explicitly: an in-place fillna on df['GIDP'] is
# chained assignment and is not guaranteed to write through to df.
df['GIDP'] = df['GIDP'].fillna(0)

In [1017]:
100 * df.isnull().sum() / len(df)

retroID    0.0
yearID     0.0
stint      0.0
teamID     0.0
lgID       0.0
G          0.0
AB         0.0
R          0.0
H          0.0
2B         0.0
3B         0.0
HR         0.0
RBI        0.0
SB         0.0
CS         0.0
BB         0.0
SO         0.0
IBB        0.0
HBP        0.0
SH         0.0
SF         0.0
GIDP       0.0
dtype: float64

<p>We've handled all missing data in the batting database</p>

<h2>Data Integration</h2>
<p>Now we need to eliminate any columns that we don't want (if any) and convert the ones we keep to numerical values.</p>

In [1018]:
df.head()

Unnamed: 0,retroID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
79400,aardsda01,2013,1,NYN,NL,43,0,0,0,0,...,0,0,0.0,0,0,0.0,0,0,0.0,0.0
82244,aardsda01,2015,1,ATL,NL,33,1,0,0,0,...,0,0,0.0,0,1,0.0,0,0,0.0,0.0
69712,aardsda01,2006,1,CHN,NL,45,2,0,0,0,...,0,0,0.0,0,0,0.0,0,1,0.0,0.0
73859,aardsda01,2009,1,SEA,AL,73,0,0,0,0,...,0,0,0.0,0,0,0.0,0,0,0.0,0.0
71089,aardsda01,2007,1,CHA,AL,25,0,0,0,0,...,0,0,0.0,0,0,0.0,0,0,0.0,0.0


In [1019]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88242 entries, 79400 to 86706
Data columns (total 22 columns):
retroID    88242 non-null object
yearID     88242 non-null int64
stint      88242 non-null int64
teamID     88242 non-null object
lgID       88242 non-null object
G          88242 non-null int64
AB         88242 non-null int64
R          88242 non-null int64
H          88242 non-null int64
2B         88242 non-null int64
3B         88242 non-null int64
HR         88242 non-null int64
RBI        88242 non-null int64
SB         88242 non-null int64
CS         88242 non-null float64
BB         88242 non-null int64
SO         88242 non-null int64
IBB        88242 non-null float64
HBP        88242 non-null int64
SH         88242 non-null int64
SF         88242 non-null float64
GIDP       88242 non-null float64
dtypes: float64(4), int64(15), object(3)
memory usage: 15.5+ MB


<p>We will handle the metadata columns later and only worry about numerical columns for now</p>

In [1020]:
df['lgID'].value_counts()

NL    44129
AL    44113
Name: lgID, dtype: int64

In [1021]:
pd.get_dummies(df['lgID'], drop_first=True)

Unnamed: 0,NL
79400,1
82244,1
69712,1
73859,0
71089,0
...,...
20499,0
18050,0
83729,0
85212,0


<p>This one will be easy - there are only two leagues in the dataset, so we can just transform that into a single boolean column. Of course that column will be NL, the superior league.</p>

In [1022]:
# 1 when the row was played in the NL, 0 otherwise. Comparing against the label
# directly always yields exactly one column, whereas assigning the get_dummies
# frame only works while lgID holds exactly two distinct values.
df['NL'] = (df['lgID'] == 'NL').astype('uint8')
# The league label is now fully captured by the indicator column.
df = df.drop(columns=['lgID'])

In [1023]:
df

Unnamed: 0,retroID,yearID,stint,teamID,G,AB,R,H,2B,3B,...,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,NL
79400,aardsda01,2013,1,NYN,43,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,1
82244,aardsda01,2015,1,ATL,33,1,0,0,0,0,...,0,0.0,0,1,0.0,0,0,0.0,0.0,1
69712,aardsda01,2006,1,CHN,45,2,0,0,0,0,...,0,0.0,0,0,0.0,0,1,0.0,0.0,1
73859,aardsda01,2009,1,SEA,73,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,0
71089,aardsda01,2007,1,CHA,25,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20499,zuverge01,1955,2,BAL,28,23,1,5,1,0,...,0,0.0,1,5,0.0,0,1,0.0,1.0,0
18050,zuverge01,1951,1,CLE,16,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,0
83729,zychto01,2015,1,SEA,13,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,0
85212,zychto01,2016,1,SEA,12,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,0


<p>Now we need to figure out how to handle the teamID column.</p>

In [1024]:
df['teamID'].nunique()

45

<p>Since we have more than 30 team IDs, to keep things consistent I'm just going to map them to franchise ID.</p>

In [1025]:
from team_dict import team_dict, team_nums

# Build the lookup once instead of calling team_dict() again for every row.
# NOTE(review): assumes team_dict() returns the same mapping on each call — confirm.
franchise_by_team = team_dict()
# Index the mapping directly so an unknown team ID still raises KeyError
# rather than silently turning into NaN.
df['teamID'] = df['teamID'].apply(lambda team: franchise_by_team[team])

<p>Now let's look at what other data integration tasks we need to do</p>

In [1026]:
df['teamID'].nunique()

30

<p>We're now all set with team IDs as strings</p>

In [1027]:
df.head()

Unnamed: 0,retroID,yearID,stint,teamID,G,AB,R,H,2B,3B,...,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,NL
79400,aardsda01,2013,1,NYM,43,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,1
82244,aardsda01,2015,1,ATL,33,1,0,0,0,0,...,0,0.0,0,1,0.0,0,0,0.0,0.0,1
69712,aardsda01,2006,1,CHC,45,2,0,0,0,0,...,0,0.0,0,0,0.0,0,1,0.0,0.0,1
73859,aardsda01,2009,1,SEA,73,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,0
71089,aardsda01,2007,1,CHW,25,0,0,0,0,0,...,0,0.0,0,0,0.0,0,0,0.0,0.0,0


In [1028]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88242 entries, 79400 to 86706
Data columns (total 22 columns):
retroID    88242 non-null object
yearID     88242 non-null int64
stint      88242 non-null int64
teamID     88242 non-null object
G          88242 non-null int64
AB         88242 non-null int64
R          88242 non-null int64
H          88242 non-null int64
2B         88242 non-null int64
3B         88242 non-null int64
HR         88242 non-null int64
RBI        88242 non-null int64
SB         88242 non-null int64
CS         88242 non-null float64
BB         88242 non-null int64
SO         88242 non-null int64
IBB        88242 non-null float64
HBP        88242 non-null int64
SH         88242 non-null int64
SF         88242 non-null float64
GIDP       88242 non-null float64
NL         88242 non-null uint8
dtypes: float64(4), int64(15), object(2), uint8(1)
memory usage: 14.9+ MB


In [1029]:
df = df.sort_index()

In [1030]:
df.head()

Unnamed: 0,retroID,yearID,stint,teamID,G,AB,R,H,2B,3B,...,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,NL
0,adamsba01,1919,1,PIT,34,92,2,17,2,1,...,0,0.0,6,13,0.0,0,3,0.0,0.0,1
1,adamsbe01,1919,1,PHI,78,232,14,54,7,2,...,4,2.0,6,27,0.0,0,3,0.0,0.0,1
2,adamswi01,1919,1,OAK,1,2,0,0,0,0,...,0,0.0,0,1,0.0,0,0,0.0,0.0,0
3,agnewsa01,1919,1,MIN,42,98,6,23,7,0,...,1,0.5,10,8,0.0,1,9,0.0,0.0,0
4,ainsmed01,1919,1,DET,114,364,42,99,17,12,...,9,4.5,45,30,0.0,1,12,0.0,0.0,0


<p>We need some sort of dictionary to associate a player's retroID with an index. The following steps take care of that. This is so we can later associate the correct retroID with our data.</p>

In [1031]:
# Move the current row labels into a regular 'index' column and renumber the rows.
df = df.reset_index()

In [1032]:
metadata_column_labels = ['index', 'yearID', 'stint', 'teamID']

In [1033]:
# Player key first, followed by the non-numeric bookkeeping columns, on a fresh 0..n-1 index.
metadata = df[['retroID'] + metadata_column_labels].reset_index(drop=True)

In [1034]:
metadata.head()

Unnamed: 0,retroID,index,yearID,stint,teamID
0,adamsba01,0,1919,1,PIT
1,adamsbe01,1,1919,1,PHI
2,adamswi01,2,1919,1,OAK
3,agnewsa01,3,1919,1,MIN
4,ainsmed01,4,1919,1,DET


<p>The metadata table will eventually be expanded with information from Players.csv to hold all relevant player information that isn't used for the neural network.</p>

In [1035]:
# Map the 'index' value of each player's first row to that player's retroID.
# Selecting the column and calling Series.to_dict() builds the same mapping
# without transposing the frame, and does not depend on DataFrame.to_dict()
# accepting a non-standard orient string.
indexer = metadata.drop_duplicates('retroID').set_index('index')['retroID'].to_dict()

In [1036]:
df = df.drop(columns=metadata_column_labels)

In [1037]:
df.head()

Unnamed: 0,retroID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,NL
0,adamsba01,34,92,2,17,2,1,0,4,0,0.0,6,13,0.0,0,3,0.0,0.0,1
1,adamsbe01,78,232,14,54,7,2,1,17,4,2.0,6,27,0.0,0,3,0.0,0.0,1
2,adamswi01,1,2,0,0,0,0,0,0,0,0.0,0,1,0.0,0,0,0.0,0.0,0
3,agnewsa01,42,98,6,23,7,0,0,10,1,0.5,10,8,0.0,1,9,0.0,0.0,0
4,ainsmed01,114,364,42,99,17,12,3,32,9,4.5,45,30,0.0,1,12,0.0,0.0,0


<p>Now that the metadata is gone, we just have the ID and the numerical batting information. We can group by the ID and just sum every other column to get player career totals.</p>

In [1038]:
# One row per player: sum every remaining column, keeping retroID as an ordinary column.
df = df.groupby('retroID', as_index=False).sum()

In [1039]:
df

Unnamed: 0,retroID,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,NL
0,aardsda01,331,4,0,0,0,0,0,0,0,0.0,0,2,0.0,0,1,0.0,0.0,4
1,aaronha01,3298,12364,2174,3771,624,98,755,2297,240,73.0,1402,1383,293.0,32,21,121.0,328.0,21
2,aaronto01,437,944,102,216,42,6,13,94,9,8.0,86,145,3.0,0,9,6.0,36.0,7
3,aasedo01,448,5,0,0,0,0,0,0,0,0.0,0,3,0.0,0,0,0.0,0.0,2
4,abadan01,15,21,1,2,0,0,0,0,0,1.0,4,5,0.0,0,0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15187,zupcibo01,319,795,99,199,47,4,7,80,7,5.0,57,137,3.0,6,20,8.0,15.0,0
15188,zupofr01,16,18,3,3,1,0,0,0,0,0.0,2,6,0.0,0,0,0.0,0.0,0
15189,zuvelpa01,209,491,41,109,17,2,2,20,2,0.0,34,50,1.0,2,18,0.0,8.0,4
15190,zuverge01,266,142,5,21,2,1,0,7,0,1.0,9,39,0.0,0,16,0.0,3.0,1


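<p>Since we summed everything, the NL column now holds the number of NL stints rather than a flag, so we need to change it back. Any positive total means the player appeared in the NL at least once, so we map positive values to 1 and everything else to 0 to get the same 1-or-0 encoding we had before.</p>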

In [1046]:
# Collapse the summed NL counts back to a flag: 1 for any positive total, else 0.
df['NL'] = (df['NL'] > 0).astype(int)

In [1047]:
tensor = df.drop(columns=['retroID'])

In [1049]:
tensor

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP,NL
0,331,4,0,0,0,0,0,0,0,0.0,0,2,0.0,0,1,0.0,0.0,1
1,3298,12364,2174,3771,624,98,755,2297,240,73.0,1402,1383,293.0,32,21,121.0,328.0,1
2,437,944,102,216,42,6,13,94,9,8.0,86,145,3.0,0,9,6.0,36.0,1
3,448,5,0,0,0,0,0,0,0,0.0,0,3,0.0,0,0,0.0,0.0,1
4,15,21,1,2,0,0,0,0,0,1.0,4,5,0.0,0,0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15187,319,795,99,199,47,4,7,80,7,5.0,57,137,3.0,6,20,8.0,15.0,0
15188,16,18,3,3,1,0,0,0,0,0.0,2,6,0.0,0,0,0.0,0.0,0
15189,209,491,41,109,17,2,2,20,2,0.0,34,50,1.0,2,18,0.0,8.0,1
15190,266,142,5,21,2,1,0,7,0,1.0,9,39,0.0,0,16,0.0,3.0,1
