This notebook prepares the data for pitchers, and runs parallel to the Baseball Data Preparation notebook.

In [1]:
import numpy as np
import pandas as pd

### Win Shares Data

In [2]:
# Load the Win Shares table; it is summarised per career below and merged onto the
# pitching data by playerID and yearID later in the notebook.
WS_df = pd.read_csv("db/WinShares.csv")
WS_df.head(10)

FileNotFoundError: File b'../db/WinShares.csv' does not exist

In [3]:
# Career view of Win Shares: add up every season for each player.
# The summed yearID is meaningless, so it is dropped straight away.
WS_Career_df = (
    WS_df.groupby(["playerID"], as_index=False)
    .sum()
    .drop(columns="yearID")
)


NameError: name 'WS_df' is not defined

In [None]:
# After the per-player sum, "year WS" holds the career total, so label it as such.
WS_Career_df = WS_Career_df.rename(columns={"year WS": "careerWS"})

WS_Career_df.columns

### Pitching Data  
This will be the source of our statistics.

In [None]:
# Read the raw pitching table from the same "db/" folder that the WinShares load and the
# pitching2.csv export use; the previous "../db/" prefix pointed at a different directory.
pitching_df = pd.read_csv("db/Pitching.csv")

Since we aren't concerned about per-team statistics, let's combine the stints of any multiple-stint season into one row per player per year.

In [None]:
# Collapse multiple stints into a single row per player-season by summing the counting stats.
pitching_df = pitching_df.groupby(["playerID", "yearID"], as_index=False).sum()

# ERA is a rate, so adding the per-stint values together gives a meaningless number
# (two stints at 3.00 and 4.00 would become 7.00).  Rebuild it from the summed totals:
# ERA = 9 * ER / innings = 27 * ER / IPouts.  Seasons with no outs recorded have no
# defined ERA and are left as NaN.  (BAOpp is summed the same way, but it is dropped
# later in the notebook and never used.)
outs_recorded = pitching_df["IPouts"].where(pitching_df["IPouts"] > 0)
pitching_df["ERA"] = 27 * pitching_df["ER"] / outs_recorded
pitching_df.head(10)

But first, one more thing.  We'll use Games to determine players' most dominant decade.

In [None]:
# Work on an independent copy: a "decade" column is added to this frame next, and writing
# to a bare column slice of pitching_df would raise SettingWithCopyWarning.
decades_df = pitching_df[["playerID", "yearID", "G"]].copy()

In [None]:
# Label every season with the decade it belongs to (1990-1999 -> 1990).
# Floor division is used rather than rounding yearID / 10: rounding pushed the second half
# of each decade into the next one, and round-half-to-even treated years ending in 5
# inconsistently (1985 -> 1980 but 1995 -> 2000).
# assign() returns a new frame, so this is safe even when decades_df is a slice.
decades_df = decades_df.assign(decade=(decades_df["yearID"] // 10) * 10)
decades_df.head()

In [None]:
# Total the games for each player within each decade
# (every other numeric column is summed along with G).
decades_df = decades_df.groupby(
    by=["playerID", "decade"],
    as_index=False,
).sum()
decades_df.head()

In [None]:
# Order each player's decades from most to fewest games so that the first row per player
# is the dominant decade.  drop=True discards the old row labels instead of keeping them
# as a stray "index" column.
decades_df = decades_df.sort_values(
    ["playerID", "G"], ascending=[True, False]
).reset_index(drop=True)
decades_df.head()

Keep the first occurrence of each player.  Having sorted the data in descending order of games pitched, that row is the player's dominant decade.

In [None]:
# decades_df is ordered by playerID and then by games (descending), so the first row kept
# for each player is the decade in which he appeared in the most games.
# drop_duplicates replaces the row-by-row iterrows scan: same result, no Python-level loop,
# and no reliance on an empty-string sentinel for "previous player".
decades2_df = (
    decades_df.drop_duplicates(subset="playerID", keep="first")
    .loc[:, ["playerID", "decade"]]
    .reset_index(drop=True)
)
        

In [None]:
# Preview the playerID -> dominant decade lookup.
decades2_df.head(10)

Oh, and one more thing.  We need to know the last year a player was active, so that we know if they are even eligible for the Hall of Fame.

In [None]:
# One row per (player, season); years_df is reused below to count seasons.
# drop_duplicates guards against a player appearing more than once in a season
# (e.g. one row per team when the stints have not been combined).
years_df = pitching_df[["playerID", "yearID"]].drop_duplicates()

# The latest season on record for each player.  GroupBy.max() takes no axis argument
# (the previous axis="yearID" is rejected with a TypeError by current pandas); yearID is
# the only non-key column, so a plain max() is exactly what is needed.
lastyear_df = (
    years_df.groupby(["playerID"], as_index=False)
    .max()
    .rename(columns={"yearID": "lastyear"})
)
lastyear_df.head(20)

In [None]:
# Number of seasons per player: count the yearID rows in each player's group.
numyear_df = (
    years_df.groupby(["playerID"], as_index=False)
    .count()
    .rename(columns={"yearID": "numyears"})
)
numyear_df.head(15)

### Now, it's time to put it all together

In [None]:
# Season-level Win Shares are matched on player and year ...
pitching_df = pitching_df.merge(WS_df, on=["playerID", "yearID"], how="left")

# ... while the per-player lookups (career Win Shares, dominant decade, last season,
# number of seasons) are matched on player alone.  Left joins keep every pitching row.
for per_player_df in (WS_Career_df, decades2_df, lastyear_df, numyear_df):
    pitching_df = pitching_df.merge(per_player_df, on="playerID", how="left")


In [None]:
# Inspect the column set produced by the merges above.
pitching_df.columns

Now we start cutting back players we won't be considering.


In [None]:
# Early-era players are out of scope: keep only rows whose dominant decade is after 1910.
pitching2_df = pitching_df[pitching_df["decade"] > 1910]

In [None]:
# Too few seasons is not enough data to judge a player: keep those with more than five.
# The mask is built from pitching2_df itself; building it from the unfiltered pitching_df
# only worked because pandas re-aligned the longer mask by index.
# NOTE(review): the original comment said "fewer than five years", but `> 5` also drops
# players with exactly five seasons - confirm the intended cut-off.
pitching2_df = pitching2_df.loc[pitching2_df["numyears"] > 5, :]

In [None]:
# Discard low-impact players: keep only careers worth more than 50 Win Shares.
# The mask is built from pitching2_df itself; building it from the unfiltered pitching_df
# only worked because pandas re-aligned the longer mask by index.
pitching2_df = pitching2_df.loc[pitching2_df["careerWS"] > 50, :]

In [None]:
# Average Win Shares per season played.  assign() returns a new frame, which avoids the
# SettingWithCopyWarning raised when a column is written onto a filtered slice.
pitching2_df = pitching2_df.assign(
    wsPerYr=pitching2_df["careerWS"] / pitching2_df["numyears"]
)

In [None]:
# Keep players averaging more than 10 Win Shares per season.  The result is copied because
# later cells delete columns from it and add new ones; without the copy those writes would
# target a slice of the previous frame and raise SettingWithCopyWarning.
pitching2_df = pitching2_df.loc[pitching2_df["wsPerYr"] > 10, :].copy()

In [None]:
# Spot-check the rows that survived the filters above.
pitching2_df.head(5)

Let's output the group of eligible players we've found so that we can make final selections.
This will be done in Excel. 

In [None]:
# List the columns still present before the unneeded ones are removed.
pitching2_df.columns

In [None]:
# Remove the columns that play no part in the rest of the analysis.
pitching2_df = pitching2_df.drop(
    columns=[
        "stint",
        "CG",
        "SHO",
        # I think there are not enough relief pitchers in the HOF yet to create a reliable
        # model, and they would need to be modelled somewhat differently than starting
        # pitchers anyway - hence no saves.
        "SV",
        "HR",
        "BAOpp",
        "IBB",
        "WP",
        "HBP",
        "BK",
        "BFP",
        "GF",
        "R",
        "SH",
        "SF",
        "GIDP",
    ]
)

Recognize seasonal accomplishments by awarding points for meeting certain thresholds

In [None]:
def _tier_points(values, tiers):
    """Score each value with the points of the highest tier it exceeds.

    values -- numeric Series; NaN never exceeds a floor and therefore scores 0.
    tiers  -- (floor, points) pairs ordered from the highest floor down; a value earns
              the points of the first pair whose floor it is strictly greater than.

    Returns a NumPy integer array aligned positionally with ``values``.
    """
    conditions = [(values > floor).to_numpy() for floor, _ in tiers]
    awards = [points for _, points in tiers]
    return np.select(conditions, awards, default=0)


# (floor, points) tables; a season must exceed the floor, so "> 29" means 30 or more.
WIN_TIERS = [(29, 15), (24, 10), (22, 8), (19, 6), (17, 4), (14, 2)]
STRIKEOUT_TIERS = [(299, 6), (249, 3), (199, 2)]
PCT_MIN_WINS = 13        # winning percentage only counts with more than 13 wins
PCT_THRESHOLD = 0.7      # ... and a percentage of at least .700 earns 2 points
ERA_MIN_OUTS = 450       # ERA only counts with at least 450 outs (150 innings)

# Coerce defensively: anything non-numeric becomes NaN and simply scores no points.
season_wins = pd.to_numeric(pitching2_df["W"], errors="coerce")
season_k = pd.to_numeric(pitching2_df["SO"], errors="coerce")
season_era = pd.to_numeric(pitching2_df["ERA"], errors="coerce")

# Winning percentage; rows with no decisions give NaN and fail the threshold test.
win_pct = pitching2_df["W"] / (pitching2_df["W"] + pitching2_df["L"])
pct_points = np.where(
    ((season_wins > PCT_MIN_WINS) & (win_pct >= PCT_THRESHOLD)).to_numpy(), 2, 0
)

# ERA points: 4 for an ERA under 2, 1 for an ERA under 3, qualified seasons only.
era_qualified = pitching2_df["IPouts"] >= ERA_MIN_OUTS
era_points = np.select(
    [
        (era_qualified & (season_era < 2)).to_numpy(),
        (era_qualified & (season_era < 3)).to_numpy(),
    ],
    [4, 1],
    default=0,
)

# assign() builds a new frame, so no SettingWithCopyWarning is raised even though
# pitching2_df came from a chain of filters.  Columns are added in the same order as before.
pitching2_df = pitching2_df.assign(
    pts_wins=_tier_points(season_wins, WIN_TIERS),
    pts_k=_tier_points(season_k, STRIKEOUT_TIERS),
    pts_pct=pct_points,
    pts_era=era_points,
)


In [None]:
# Write the candidate list (with a header row, without the row index) for the manual
# selection step described above.
pitching2_df.to_csv(
    "db/pitching2.csv",
    header=True,
    index=False,
)