This notebook is concerned with Pitchers, and runs parallel to Baseball Data Preparation notebook.

In [1]:
import numpy as np
import pandas as pd

### Win Shares Data.  
Pretty much already as we need it.

In [2]:
WS_df = pd.read_csv("db/WinShares.csv")
WS_df.head(10)

Unnamed: 0,playerID,CareerWS
0,ruthba01,756
1,cobbty01,722
2,bondsba01,699
3,wagneho01,655
4,aaronha01,643
5,mayswi01,642
6,youngcy01,634
7,speaktr01,630
8,musiast01,604
9,collied01,574


### Pitching Data  
This will be the source of our statistics.

In [3]:
pitching_df = pd.read_csv("db/Pitching.csv")

Since we aren't concerned about per-team statistics, let's combine all multiple-stint years into one row.

In [4]:
pitching_df = pitching_df.groupby(["playerID","yearID"],as_index=False).sum()
pitching_df.head(10)

Unnamed: 0,playerID,yearID,stint,W,L,G,GS,CG,SHO,SV,...,IBB,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP
0,aardsda01,2004,1,1,0,11,0,0,0,0,...,0.0,0,2.0,0,61.0,5,8,0.0,1.0,1.0
1,aardsda01,2006,1,3,0,45,0,0,0,0,...,0.0,1,1.0,0,225.0,9,25,1.0,3.0,2.0
2,aardsda01,2007,1,2,1,25,0,0,0,0,...,3.0,2,1.0,0,151.0,7,24,2.0,1.0,1.0
3,aardsda01,2008,1,4,2,47,0,0,0,0,...,2.0,3,5.0,0,228.0,7,32,3.0,2.0,4.0
4,aardsda01,2009,1,3,6,73,0,0,0,38,...,3.0,2,0.0,0,296.0,53,23,2.0,1.0,2.0
5,aardsda01,2010,1,0,6,53,0,0,0,31,...,5.0,2,2.0,0,202.0,43,19,7.0,1.0,5.0
6,aardsda01,2012,1,0,0,1,0,0,0,0,...,0.0,0,0.0,0,5.0,1,1,0.0,0.0,0.0
7,aardsda01,2013,1,2,2,43,0,0,0,0,...,6.0,1,4.0,1,178.0,7,20,2.0,1.0,2.0
8,aardsda01,2015,1,1,1,33,0,0,0,0,...,3.0,1,1.0,0,129.0,9,17,0.0,1.0,4.0
9,aasedo01,1977,1,6,2,13,13,4,2,0,...,1.0,0,1.0,0,373.0,0,36,2.0,3.0,7.0


But first, one more thing.  We'll use Games to determine players' most dominent decade.

In [5]:
decades_df = pitching_df[["playerID", "yearID", "G"]]

In [6]:
x = decades_df.loc[:, "yearID"] / 10
decades_df.loc[:, "decade"] = 10 * x.round()
decades_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,playerID,yearID,G,decade
0,aardsda01,2004,11,2000.0
1,aardsda01,2006,45,2010.0
2,aardsda01,2007,25,2010.0
3,aardsda01,2008,47,2010.0
4,aardsda01,2009,73,2010.0


In [7]:
decades_df = decades_df.groupby(["playerID", "decade"], as_index=False).sum()
decades_df.head()

Unnamed: 0,playerID,decade,yearID,G
0,aardsda01,2000.0,2004,11
1,aardsda01,2010.0,14065,287
2,aardsda01,2020.0,2015,33
3,aasedo01,1980.0,15846,259
4,aasedo01,1990.0,9940,189


In [8]:
decades_df = decades_df.sort_values(["playerID","G"], ascending=[True, False]).reset_index()
decades_df.head()

Unnamed: 0,index,playerID,decade,yearID,G
0,1,aardsda01,2010.0,14065,287
1,2,aardsda01,2020.0,2015,33
2,0,aardsda01,2000.0,2004,11
3,3,aasedo01,1980.0,15846,259
4,4,aasedo01,1990.0,9940,189


Keep the first occurance of each player.  Having sorted the data in descending order of at bats, that would be the player's dominant decade.

In [9]:
decades = []
names = []
prevname = ''
for index, player in decades_df.iterrows():
    if player["playerID"] != prevname:
        names.append(player["playerID"])
        decades.append(player["decade"])
        prevname=player["playerID"]
      
decades2_dict = {"playerID": names,
                  "decade": decades}
decades2_df = pd.DataFrame(decades2_dict)
        

In [10]:
decades2_df.head(10)

Unnamed: 0,playerID,decade
0,aardsda01,2010.0
1,aasedo01,1980.0
2,abadfe01,2010.0
3,abbeybe01,1890.0
4,abbeych01,1900.0
5,abbotda01,1890.0
6,abbotgl01,1980.0
7,abbotji01,1990.0
8,abbotky01,1990.0
9,abbotpa01,2000.0


Oh, and one more thing.  We need to know the last year a player was active, so that we know if they are even eligible for the Hall of Fame.

In [11]:
years_df = pitching_df[["playerID", "yearID"]]
# drop duplicates that exist because a player might play for more than one team in any given year
years_df = years_df.drop_duplicates()
lastyear_df = years_df.groupby(["playerID"], as_index=False).max(axis="yearID")
lastyear_df = lastyear_df.rename(columns={"yearID":"lastyear"})
lastyear_df.head(20)

Unnamed: 0,playerID,lastyear
0,aardsda01,2015
1,aasedo01,1990
2,abadfe01,2017
3,abbeybe01,1896
4,abbeych01,1896
5,abbotda01,1890
6,abbotgl01,1984
7,abbotji01,1999
8,abbotky01,1996
9,abbotpa01,2004


In [12]:
# also need number of seasons
numyear_df = years_df.groupby(["playerID"], as_index=False).count()
numyear_df = numyear_df.rename(columns={"yearID":"numyears"})
numyear_df.head(15)

Unnamed: 0,playerID,numyears
0,aardsda01,9
1,aasedo01,13
2,abadfe01,8
3,abbeybe01,5
4,abbeych01,1
5,abbotda01,1
6,abbotgl01,11
7,abbotji01,10
8,abbotky01,4
9,abbotpa01,11


### Now, it's time to put it all together

In [13]:
pitching_df = pd.merge(pitching_df, WS_df, on="playerID", how="left")
pitching_df = pd.merge(pitching_df, decades2_df, on="playerID", how="left")
pitching_df = pd.merge(pitching_df, lastyear_df, on="playerID", how="left")
pitching_df = pd.merge(pitching_df, numyear_df, on="playerID", how="left")


In [14]:
pitching_df.columns

Index(['playerID', 'yearID', 'stint', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV',
       'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp', 'ERA', 'IBB', 'WP',
       'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP', 'CareerWS', 'decade',
       'lastyear', 'numyears'],
      dtype='object')

Now we start cutting back players we won't be considering.


In [27]:
# discard players from early eras
pitching2_df = pitching_df.loc[pitching_df["decade"] > 1910, :]

In [28]:
# fewer than ten years would not be enough data to judge a player
# pitching2_df = pitching2_df.loc[pitching_df["numyears"] > 9, :]

In [29]:
# discard low-impact players
pitching2_df = pitching2_df.loc[pitching_df["CareerWS"] > 75, :]

In [30]:
pitching2_df["wsPerYr"] = pitching2_df["CareerWS"] / pitching2_df["numyears"]

In [31]:
pitching2_df = pitching2_df.loc[pitching2_df["wsPerYr"] > 10, :]

In [32]:
pitching2_df.head(5)

Unnamed: 0,playerID,yearID,stint,W,L,G,GS,CG,SHO,SV,...,GF,R,SH,SF,GIDP,CareerWS,decade,lastyear,numyears,wsPerYr
187,adamsba01,1906,1,0,1,1,1,0,0,0,...,0,8,0.0,0.0,0.0,243.0,1920.0,1926,19,12.789474
188,adamsba01,1907,1,0,2,4,3,1,0,0,...,1,25,0.0,0.0,0.0,243.0,1920.0,1926,19,12.789474
189,adamsba01,1909,1,12,3,25,12,7,3,2,...,11,25,0.0,0.0,0.0,243.0,1920.0,1926,19,12.789474
190,adamsba01,1910,1,18,9,34,30,16,3,0,...,3,95,0.0,0.0,0.0,243.0,1920.0,1926,19,12.789474
191,adamsba01,1911,1,22,12,40,37,24,6,0,...,2,97,0.0,0.0,0.0,243.0,1920.0,1926,19,12.789474


Let's output the group of eligible players we've found so that we can make final selections.
This will be done in Excel so that we can ensure that we have balance over decades and positions.

In [33]:
pitching2_df.columns

Index(['playerID', 'yearID', 'stint', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV',
       'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp', 'ERA', 'IBB', 'WP',
       'HBP', 'BK', 'BFP', 'GF', 'R', 'SH', 'SF', 'GIDP', 'CareerWS', 'decade',
       'lastyear', 'numyears', 'wsPerYr'],
      dtype='object')

In [34]:
# drop columns which are no longer needed

del pitching2_df ["stint"]
del pitching2_df ["CG"]
del pitching2_df ["SHO"]
del pitching2_df ["SV"]  
# I think there are not enough relief pitchers in the HOF yet to create a reliable model, and they
# would need to be modelled somewhat differently than starting pitchers anyway.
del pitching2_df ["HR"]
del pitching2_df ["BAOpp"]
del pitching2_df ["IBB"]
del pitching2_df ["WP"]
del pitching2_df ["HBP"]
del pitching2_df ["BK"]
del pitching2_df ["BFP"]
del pitching2_df ["GF"]
del pitching2_df ["R"]
del pitching2_df ["SH"]
del pitching2_df ["SF"]
del pitching2_df ["GIDP"]

In [35]:
pitching2_df.to_csv("db/pitching2.csv", index=False, header=True)