In [1]:
import numpy as np
import pandas as pd

Fielding Data.  We'll use it to determine primary player position, and to determine number of games played at tough defensive positions.

In [None]:
# Load the fielding table and keep only the columns used to work out positions.
fielding_columns = ["playerID", "yearID", "POS", "G"]
fielding_df = pd.read_csv("db/Fielding.csv").loc[:, fielding_columns]
fielding_df.head()

In [None]:
# Spread the games-played count into one dedicated column per position;
# a row gets 0 in every column that does not match its own POS value.
played_at = fielding_df["POS"]
for pos_code in ("P", "C", "1B", "2B", "3B", "SS", "OF"):
    fielding_df["Games_" + pos_code] = np.where(played_at == pos_code, fielding_df["G"], 0)
fielding_df.head()

Sum them up per player per year, which will be easier to handle when we eventually compute career statistics.

In [None]:
# Total games at each position for every (player, year) pair.
# numeric_only=True keeps the text column POS out of the aggregation; without
# it, pandas >= 2.0 "sums" strings by concatenating them (e.g. "SSOF").
posYrG_df = fielding_df.groupby(["playerID", "yearID"], as_index=False).sum(numeric_only=True)
posYrG_df.head(20)

Drop fielding games, so that it's not confused with an existing batting games field

In [None]:
# Remove the fielding games total so it cannot be mistaken for the batting "G".
posYrG_df = posYrG_df.drop(columns=["G"])
posYrG_df.head()

Now, sum them up per player ...

In [None]:
# Career totals per player. numeric_only=True restricts the sum to numeric
# columns so the text column POS is not string-concatenated (pandas >= 2.0).
# The summed yearID column that remains carries no meaning.
fielding2_df = fielding_df.groupby(["playerID"], as_index=False).sum(numeric_only=True)


In [None]:
# Preview the per-player totals (first 30 rows).
fielding2_df.head(30)

... and then find out which is the highest.

In [None]:
# Position codes in tie-break order. idxmax returns the first column holding
# the row maximum, so on equal game counts the earliest code listed here wins,
# which is the same precedence as a chain of strict ">" comparisons that
# starts from "P".
position_codes = ["P", "C", "1B", "2B", "3B", "SS", "OF"]

# One column per position, relabelled from "Games_<code>" to the bare code so
# that idxmax yields the position code directly.
games_by_position = fielding2_df[["Games_" + code for code in position_codes]].rename(
    columns={"Games_" + code: code for code in position_codes}
)

# Primary position = the position with the most career games. This replaces a
# row-by-row iterrows() loop with a single vectorized call.
positions_df = pd.DataFrame(
    {
        "playerID": fielding2_df["playerID"],
        "Pos": games_by_position.idxmax(axis=1),
    }
).reset_index(drop=True)

positions_df.head(10)

Win Shares Data.  Pretty much already as we need it.

In [None]:
# Read the win-shares table unchanged; it is joined to the batting data on
# "playerID" further down. NOTE(review): presumably this file supplies the
# "CareerWS" column used by the later filters -- confirm against the CSV.
WS_df = pd.read_csv("db/WinShares.csv")
WS_df.head(10)

Batting Data.  This will be the source of our statistics.

In [None]:
# Raw batting table, read straight from the CSV with default parsing.
batting_df = pd.read_csv("db/Batting.csv")

Since we aren't concerned about per-team statistics, let's combine all multiple-stint years into one row.

In [None]:
# Collapse multiple rows for the same (player, year) into a single row.
# numeric_only=True limits the sum to numeric columns; without it, pandas >= 2.0
# also "sums" any text columns by concatenating their strings.
batting_df = batting_df.groupby(["playerID", "yearID"], as_index=False).sum(numeric_only=True)
batting_df.head(10)

But first, one more thing.  We'll use At Bats to determine each player's most dominant decade.

In [None]:
# Take an explicit copy so that adding a column to decades_df later does not
# write into a slice of batting_df (which triggers SettingWithCopyWarning).
decades_df = batting_df[["playerID", "yearID", "AB"]].copy()
# Preview only, rather than displaying the whole frame.
decades_df.head()

In [None]:
# Label each season with the decade it falls in (1980-1989 -> 1980).
# Rounding yearID / 10 instead pushes years ending in 6-9 into the following
# decade and, because round() goes half-to-even, treats years ending in 5
# inconsistently (1975 -> 1980 and 1985 -> 1980, but 1995 -> 2000).
# assign() returns a new frame, so nothing is written into a slice.
decades_df = decades_df.assign(decade=(decades_df["yearID"] // 10) * 10)
decades_df.head()

In [None]:
# Add up the remaining columns for every (player, decade) pair.
by_player_decade = decades_df.groupby(["playerID", "decade"], as_index=False)
decades_df = by_player_decade.sum()
decades_df.head()

In [None]:
# Order each player's decades from most to fewest at bats, so the first row
# per player is the decade with the most at bats. drop=True discards the old
# index instead of keeping it as a stray "index" column.
decades_df = (
    decades_df
    .sort_values(["playerID", "AB"], ascending=[True, False])
    .reset_index(drop=True)
)
decades_df.head()

Keep the first occurrence of each player.  Because the data is sorted in descending order of at bats within each player, that first row is the player's dominant decade.

In [None]:
# Keep only the first row for each playerID. This relies on decades_df being
# ordered by playerID and then by descending AB, which makes that first row
# the decade with the most at bats. drop_duplicates replaces a row-by-row
# iterrows() loop that tracked the previous playerID by hand.
first_row_per_player = decades_df.drop_duplicates(subset="playerID", keep="first")
decades2_df = first_row_per_player[["playerID", "decade"]].reset_index(drop=True)
        

In [None]:
# Preview the one-row-per-player decade table.
decades2_df.head(10)

Oh, and one more thing.  We need to know the last year a player was active, so that we know if they are even eligible for the Hall of Fame.

In [None]:
years_df = batting_df[["playerID", "yearID"]]
# Drop duplicates that exist because a player might play for more than one
# team in any given year.
years_df = years_df.drop_duplicates()
# Last active season = the largest yearID per player. GroupBy.max() has no
# axis parameter; recent pandas versions reject max(axis="yearID") with a
# TypeError, so it is called without arguments.
lastyear_df = years_df.groupby(["playerID"], as_index=False).max()
lastyear_df = lastyear_df.rename(columns={"yearID": "lastyear"})
lastyear_df.head(20)

In [None]:
# Count how many seasons (yearID rows) each player has.
seasons_per_player = years_df.groupby("playerID", as_index=False)["yearID"].count()
numyear_df = seasons_per_player.rename(columns={"yearID": "numyears"})
numyear_df.head(15)

In [None]:
# Inspect the column names of the per-season position table before merging.
posYrG_df.columns

### Now, it's time to put it all together

In [None]:
# Attach the per-player tables (one left join each, keyed on playerID), in
# this order: primary position, win shares, dominant decade, last season,
# number of seasons.
per_player_tables = [positions_df, WS_df, decades2_df, lastyear_df, numyear_df]
for per_player_df in per_player_tables:
    batting_df = batting_df.merge(per_player_df, on="playerID", how="left")

# The games-by-position table is per season, so it joins on player and year.
batting_df = batting_df.merge(posYrG_df, on=["playerID", "yearID"], how="left")

In [None]:
# Inspect the column names of the merged frame.
batting_df.columns

Now we start cutting back players we won't be considering.

First, discard pitchers, who would need to be considered under different criteria and whose batting statistics would be irrelevant in any case.

In [None]:
# Keep every row whose primary position is anything other than "P".
not_pitcher = batting_df["Pos"].ne("P")
batting2_df = batting_df[not_pitcher]

In [None]:
# Keep rows whose decade value is greater than 1910. The mask is built from
# batting2_df itself; a mask taken from the unfiltered batting_df only works
# because .loc silently re-aligns it by index.
batting2_df = batting2_df.loc[batting2_df["decade"] > 1910, :]

In [None]:
# Keep rows of players with more than 9 seasons. The mask is built from
# batting2_df itself; a mask taken from the unfiltered batting_df only works
# because .loc silently re-aligns it by index.
batting2_df = batting2_df.loc[batting2_df["numyears"] > 9, :]

In [None]:
# Keep rows whose CareerWS value is greater than 75. The mask is built from
# batting2_df itself; a mask taken from the unfiltered batting_df only works
# because .loc silently re-aligns it by index.
batting2_df = batting2_df.loc[batting2_df["CareerWS"] > 75, :]

In [None]:
# Career win shares divided by number of seasons. assign() returns a new
# frame, so the column is not written into a filtered slice of batting_df
# (which triggers SettingWithCopyWarning).
batting2_df = batting2_df.assign(wsPerYr=batting2_df["CareerWS"] / batting2_df["numyears"])

In [None]:
# Keep rows with more than 10 win shares per season. The explicit copy makes
# batting2_df an independent frame, so the column deletions and additions
# applied to it afterwards do not operate on a slice of another frame.
batting2_df = batting2_df.loc[batting2_df["wsPerYr"] > 10, :].copy()

In [None]:
# Preview the rows that survived the filters.
batting2_df.head(5)

Let's output the group of eligible players we've found so that we can make final selections.
This will be done in Excel so that we can ensure that we have balance over decades and positions.

In [None]:
# Remove the columns that are no longer needed.
unneeded_columns = [
    "Games_P", "Games_OF", "Games_1B",
    "Pos", "CareerWS", "decade", "numyears", "wsPerYr",
    "stint", "CS", "SO", "IBB", "GIDP",
]
batting2_df = batting2_df.drop(columns=unneeded_columns)



In [None]:
# Plate appearances = AB + BB + HBP + SH + SF. skipna=False keeps a missing
# component propagating to the total, exactly as chained "+" would.
pa_components = ["AB", "BB", "HBP", "SH", "SF"]
batting2_df["PA"] = batting2_df[pa_components].sum(axis=1, skipna=False)

In [None]:
# A season needs more than 495 plate appearances for its rate statistics to
# be computed; every other season gets 0 for both AVG and SLG.
qualifies = batting2_df["PA"] > 495
hits = batting2_df["H"]
at_bats = batting2_df["AB"]
# Numerator of SLG: H + 2B + 2*3B + 3*HR.
slg_numerator = hits + batting2_df["2B"] + 2 * batting2_df["3B"] + 3 * batting2_df["HR"]
batting2_df["AVG"] = np.where(qualifies, hits / at_bats, 0)
batting2_df["SLG"] = np.where(qualifies, slg_numerator / at_bats, 0)

In [None]:
# Write the filtered frame to disk with a header row and without the index.
batting2_df.to_csv("db/batting2.csv", index=False, header=True)