## Part II: Predicting Hall of Fame Careers

In [113]:
# Connecting to SQLite Database
import pandas as pd
import sqlite3
conn = sqlite3.connect("lahman2016.sqlite")

In [114]:
queryMaster = 'select playerID,nameFirst,nameLast,bats,throws,debut,finalGame from Master;'
queryPitching = 'select * from Pitching;'
queryFielding = 'select * from Fielding;'
queryAwards = 'select playerID,awardID,yearID from AwardsPlayers;'
queryAllStar = 'select playerID, YearID from AllstarFull;'
queryHOF = 'select playerID,yearid from HallofFame where inducted == "Y" and category == "Player";'

In [115]:
batting_df = pd.read_csv('Batting.csv')

In [116]:
master = conn.execute(queryMaster).fetchall()
master_df = pd.DataFrame(master)
pitching = conn.execute(queryPitching).fetchall()
pitching_df = pd.DataFrame(pitching)
fielding = conn.execute(queryFielding).fetchall()
fielding_df = pd.DataFrame(fielding)
awards = conn.execute(queryAwards).fetchall()
awards_df = pd.DataFrame(awards)
allstar = conn.execute(queryAllStar).fetchall()
allstar_df = pd.DataFrame(allstar)
hof = conn.execute(queryHOF).fetchall()
hof_df = pd.DataFrame(hof)

In [117]:
master_cols = ['playerID','nameFirst','nameLast','bats','throws','debut','finalGame']
master_df.columns = master_cols

pitching_cols = ['playerID','yearID','stint','teamID','lgID','W','L','G','GS','CG','SHO',
'SV','IPOuts','H','ER','HR','BB','SO','BAOpp','ERA','IBB','WP','HBP','BK','BFP','GF','R','SH','SF','GIDP']
pitching_df.columns = pitching_cols

fielding_cols = ['playerID','yearID','stint','teamID','lgID','Pos','G','GS','InnOuts','PO','A','E','DP','PB','WP','SB','CS','ZR']
fielding_df.columns = fielding_cols

awards_cols = ['playerID','awardID','yearID']
awards_df.columns = awards_cols

allstar_cols = ['playerID','YearID']
allstar_df.columns = allstar_cols

hof_cols = ['playerID','yearid']
hof_df.columns = hof_cols

In [118]:
print(batting_df.head())

    playerID  yearID  stint teamID lgID   G   AB   R   H  2B  ...    RBI   SB  \
0  abercda01    1871      1    TRO  NaN   1    4   0   0   0  ...    0.0  0.0   
1   addybo01    1871      1    RC1  NaN  25  118  30  32   6  ...   13.0  8.0   
2  allisar01    1871      1    CL1  NaN  29  137  28  40   4  ...   19.0  3.0   
3  allisdo01    1871      1    WS3  NaN  27  133  28  44  10  ...   27.0  1.0   
4  ansonca01    1871      1    RC1  NaN  25  120  29  39  11  ...   16.0  6.0   

    CS  BB   SO  IBB  HBP  SH  SF  GIDP  
0  0.0   0  0.0  NaN  NaN NaN NaN   NaN  
1  1.0   4  0.0  NaN  NaN NaN NaN   NaN  
2  1.0   2  5.0  NaN  NaN NaN NaN   NaN  
3  1.0   0  2.0  NaN  NaN NaN NaN   NaN  
4  2.0   2  1.0  NaN  NaN NaN NaN   NaN  

[5 rows x 22 columns]


In [119]:
player_stats = {}
for i, row in batting_df.iterrows():
    playerID = row['playerID']
    G = row['G']
    AB = row['AB']
    R = row['R']
    H = row['H']
    twoB = row['2B']
    threeB = row['3B']
    HR = row['HR']
    RBI = row['RBI']
    SB = row['SB']
    CS = row['CS']
    BB = row['BB']
    SO = row['SO']
    IBB = row['IBB']
    HBP = row['HBP']
    SH = row['SH']
    SF = row['SF']
    GIDP = row['GIDP']
    if playerID in player_stats:
        player_stats[playerID]['G'] = player_stats[playerID]['G'] + G
        player_stats[playerID]['AB'] = player_stats[playerID]['AB'] + AB
        player_stats[playerID]['R'] = player_stats[playerID]['R'] + R
        player_stats[playerID]['H'] = player_stats[playerID]['H'] + H
        player_stats[playerID]['2B'] = player_stats[playerID]['2B'] + twoB
        player_stats[playerID]['3B'] = player_stats[playerID]['3B'] + threeB
        player_stats[playerID]['HR'] = player_stats[playerID]['HR'] + HR
        player_stats[playerID]['RBI'] = player_stats[playerID]['RBI'] + RBI
        player_stats[playerID]['SB'] = player_stats[playerID]['SB'] + SB
        player_stats[playerID]['CS'] = player_stats[playerID]['CS'] + CS
        player_stats[playerID]['BB'] = player_stats[playerID]['BB'] + BB
        player_stats[playerID]['SO'] = player_stats[playerID]['SO'] + SO
        player_stats[playerID]['IBB'] = player_stats[playerID]['IBB'] + IBB
        player_stats[playerID]['HBP'] = player_stats[playerID]['HBP'] + HBP
        player_stats[playerID]['SH'] = player_stats[playerID]['SH'] + SH
        player_stats[playerID]['GIDP'] = player_stats[playerID]['GIDP'] + GIDP
    else:
        player_stats[playerID] = {}
        player_stats[playerID]['G'] = G
        player_stats[playerID]['AB'] = AB
        player_stats[playerID]['R'] = R
        player_stats[playerID]['H'] = H
        player_stats[playerID]['2B'] = twoB
        player_stats[playerID]['3B'] = threeB
        player_stats[playerID]['HR'] = HR
        player_stats[playerID]['RBI'] = RBI
        player_stats[playerID]['SB'] = SB
        player_stats[playerID]['CS'] = CS
        player_stats[playerID]['BB'] = BB
        player_stats[playerID]['SO'] = SO
        player_stats[playerID]['IBB'] = IBB
        player_stats[playerID]['HBP'] = HBP
        player_stats[playerID]['SH'] = SH
        player_stats[playerID]['GIDP'] = GIDP        


In [120]:
pitcher_list = []
for i, row in pitching_df.iterrows():
    playerID = row['playerID']
    Wp = row['W']
    Lp = row['L']
    Gp = row['G']
    GSp = row['GS']
    CGp = row['CG']
    SHOp = row['SHO']
    SVp = row['SV']
    IPOutsp = row['IPOuts']
    Hp = row['H']
    ERp = row['ER']
    HRp = row['HR']
    BBp = row['BB']
    SOp = row['SO']
    IBBp = row['IBB']
    WPp = row['WP']
    HBPp = row['HBP']
    BKp = row['BK']
    BFPp = row['BFP']
    GFp = row['GF']
    Rp = row['R']
    SHp = row['SH']
    SFp = row['SF']
    GIDPp = row['GIDP']
    if playerID in player_stats and playerID in pitcher_list:
        player_stats[playerID]['Wp'] = player_stats[playerID]['Wp'] + Wp
        player_stats[playerID]['Lp'] = player_stats[playerID]['Lp'] + Lp
        player_stats[playerID]['Gp'] = player_stats[playerID]['Gp'] + Gp
        player_stats[playerID]['GSp'] = player_stats[playerID]['GSp'] + GSp
        player_stats[playerID]['CGp'] = player_stats[playerID]['CGp'] + CGp
        player_stats[playerID]['SHOp'] = player_stats[playerID]['SHOp'] + SHOp
        player_stats[playerID]['SVp'] = player_stats[playerID]['SVp'] + SVp
        player_stats[playerID]['IPOutsp'] = player_stats[playerID]['IPOutsp'] + IPOutsp
        player_stats[playerID]['Hp'] = player_stats[playerID]['Hp'] + Hp
        player_stats[playerID]['ERp'] = player_stats[playerID]['ERp'] + ERp
        player_stats[playerID]['HRp'] = player_stats[playerID]['HRp'] + HRp
        player_stats[playerID]['BBp'] = player_stats[playerID]['BBp'] + BBp
        player_stats[playerID]['SOp'] = player_stats[playerID]['SOp'] + SOp
        player_stats[playerID]['IBBp'] = player_stats[playerID]['IBBp'] + IBBp
        player_stats[playerID]['WPp'] = player_stats[playerID]['WPp'] + WPp
        player_stats[playerID]['HBPp'] = player_stats[playerID]['HBPp'] + HBPp
        player_stats[playerID]['BKp'] = player_stats[playerID]['BKp'] + BKp
        player_stats[playerID]['BFPp'] = player_stats[playerID]['BFPp'] + BFPp
        player_stats[playerID]['GFp'] = player_stats[playerID]['GFp'] + GFp
        player_stats[playerID]['Rp'] = player_stats[playerID]['Rp'] + Rp
        player_stats[playerID]['SHp'] = player_stats[playerID]['SHp'] + SHp
        player_stats[playerID]['SFp'] = player_stats[playerID]['SFp'] + SFp
        player_stats[playerID]['GIDPp'] = player_stats[playerID]['GIDPp'] + GIDPp
    else:
        pitcher_list.append(playerID)
        player_stats[playerID]['Wp'] = Wp
        player_stats[playerID]['Lp'] = Lp
        player_stats[playerID]['Gp'] = Gp
        player_stats[playerID]['GSp'] = GSp
        player_stats[playerID]['CGp'] = CGp
        player_stats[playerID]['SHOp'] = SHOp
        player_stats[playerID]['SVp'] = SVp
        player_stats[playerID]['IPOutsp'] = IPOutsp
        player_stats[playerID]['Hp'] = Hp
        player_stats[playerID]['ERp'] = ERp
        player_stats[playerID]['HRp'] = HRp
        player_stats[playerID]['BBp'] = BBp
        player_stats[playerID]['SOp'] = SOp
        player_stats[playerID]['IBBp'] = IBBp
        player_stats[playerID]['WPp'] = WPp
        player_stats[playerID]['HBPp'] = HBPp
        player_stats[playerID]['BKp'] = BKp
        player_stats[playerID]['BFPp'] = BFPp
        player_stats[playerID]['GFp'] = GFp
        player_stats[playerID]['Rp'] = Rp
        player_stats[playerID]['SHp'] = SHp
        player_stats[playerID]['SFp'] = SFp
        player_stats[playerID]['GIDPp'] = GIDPp

        

In [121]:
fielder_list = []
for i, row in fielding_df.iterrows():
    playerID = row['playerID']
    Gf = row['G']
    GSf = row['GS']
    InnOutsf = row['InnOuts']
    POf = row['PO']
    Af = row['A']
    Ef = row['E']
    DPf = row['DP']
    PBfc = row['PB']
    WPfc = row['WP']
    SBfc = row['SB']
    CSfc = row['CS']
    if playerID in player_stats and playerID in fielder_list:
        player_stats[playerID]['Gf'] = player_stats[playerID]['Gf'] + Gf
        player_stats[playerID]['GSf'] = player_stats[playerID]['GSf'] + GSf
        player_stats[playerID]['InnOutsf'] = player_stats[playerID]['InnOutsf'] + InnOutsf
        player_stats[playerID]['POf'] = player_stats[playerID]['POf'] + POf
        player_stats[playerID]['Af'] = player_stats[playerID]['Af'] + Af
        player_stats[playerID]['Ef'] = player_stats[playerID]['Ef'] + Ef
        player_stats[playerID]['DPf'] = player_stats[playerID]['DPf'] + DPf
        player_stats[playerID]['PBfc'] = player_stats[playerID]['PBfc'] + PBfc
        player_stats[playerID]['WPfc'] = player_stats[playerID]['WPfc'] + WPfc
        player_stats[playerID]['SBfc'] = player_stats[playerID]['SBfc'] + SBfc
        player_stats[playerID]['CSfc'] = player_stats[playerID]['CSfc'] + CSfc
    else:
        fielder_list.append(playerID)
        player_stats[playerID]['Gf'] = Gf
        player_stats[playerID]['GSf'] = GSf
        player_stats[playerID]['InnOutsf'] = InnOutsf
        player_stats[playerID]['POf'] = POf
        player_stats[playerID]['Af'] = Af
        player_stats[playerID]['Ef'] = Ef
        player_stats[playerID]['DPf'] = DPf
        player_stats[playerID]['PBfc'] = PBfc
        player_stats[playerID]['WPfc'] = WPfc
        player_stats[playerID]['SBfc'] = SBfc
        player_stats[playerID]['CSfc'] = CSfc

In [122]:
print(awards_df['awardID'].unique())

['Pitching Triple Crown' 'Triple Crown' 'Baseball Magazine All-Star'
 'Most Valuable Player' 'TSN All-Star' 'TSN Guide MVP'
 'TSN Major League Player of the Year' 'TSN Pitcher of the Year'
 'TSN Player of the Year' 'Rookie of the Year' 'Babe Ruth Award'
 'Lou Gehrig Memorial Award' 'World Series MVP' 'Cy Young Award'
 'Gold Glove' 'TSN Fireman of the Year' 'All-Star Game MVP' 'Hutch Award'
 'Roberto Clemente Award' 'Rolaids Relief Man Award' 'NLCS MVP' 'ALCS MVP'
 'Silver Slugger' 'Branch Rickey Award' 'Hank Aaron Award'
 'TSN Reliever of the Year' 'Comeback Player of the Year'
 'Outstanding DH Award' 'Reliever of the Year Award']


In [123]:
mvp_df = awards_df[awards_df['awardID'] == 'Most Valuable Player']
roy_df = awards_df[awards_df['awardID'] == 'Rookie of the Year']
cy_df = awards_df[awards_df['awardID'] == 'Cy Young Award']
gg_df = awards_df[awards_df['awardID'] == 'Gold Glove']
ss_df = awards_df[awards_df['awardID'] == 'Silver Slugger']

awards_list = [mvp_df,roy_df,cy_df,gg_df,ss_df]

In [124]:
print(awards_list[1].head())

       playerID             awardID  yearID
1796  robinja02  Rookie of the Year    1947
1845   darkal01  Rookie of the Year    1948
1899  sievero01  Rookie of the Year    1949
1900  newcodo01  Rookie of the Year    1949
1953  dropowa01  Rookie of the Year    1950


In [125]:
mvp_list = []
roy_list = []
cy_list = []
gg_list = []
ss_list = []
lists = [mvp_list,roy_list,cy_list,gg_list,ss_list]

In [126]:
for index, v in enumerate(awards_list):
    for i, row in v.iterrows():
        playerID = row['playerID']
        award = row['awardID']
        if playerID in player_stats and playerID in lists[index]:
            player_stats[playerID][award] += 1
        else:
            lists[index].append(playerID)
            player_stats[playerID][award] = 1

In [127]:
allstar_list = []
for i, row in allstar_df.iterrows():
    playerID = row['playerID']
    if playerID in player_stats and playerID in allstar_list:
        player_stats[playerID]['AS_games'] += 1
    else:
        allstar_list.append(playerID)
        player_stats[playerID]['AS_games'] = 1

In [128]:
for i, row in hof_df.iterrows():
    playerID = row['playerID']
    if playerID in player_stats:
        player_stats[playerID]['HoF'] = 1

In [129]:
print(len(hof_df))
print(len(master_df))

250
19105


In [130]:
print(dict(list(player_stats.items())[0:2]))

{'butlear01': {'3B': 13, 'InnOutsf': nan, 'SBfc': nan, 'SH': 40.0, 'CSfc': nan, 'CS': nan, 'HR': 3, 'POf': 630, '2B': 44, 'WPfc': nan, 'Af': 864.0, 'Gf': 356, 'SO': 102.0, 'BB': 146, 'GIDP': nan, 'HBP': 10.0, 'G': 454, 'SB': 54.0, 'H': 311, 'GSf': nan, 'AB': 1289, 'PBfc': nan, 'Ef': 126.0, 'DPf': 97.0, 'R': 181, 'RBI': 101.0, 'IBB': nan}, 'kented01': {'3B': 0, 'Wp': 0, 'SFp': nan, 'CSfc': nan, 'CS': nan, 'GFp': 0.0, 'Gp': 1, 'H': 0, 'WPfc': nan, 'WPp': 0.0, 'HBPp': 1.0, 'SHp': nan, 'Gf': 1, 'BB': 0, 'CGp': 1, 'SOp': 4, 'Lp': 1, 'HBP': 0.0, 'G': 1, 'HRp': 0, 'GSf': nan, 'SHOp': 0, 'SVp': 0, 'R': 0, 'IBB': nan, 'InnOutsf': nan, 'SBfc': nan, 'SH': nan, 'IPOutsp': 27, 'HR': 0, 'POf': 1, '2B': 0, 'BFPp': 51.0, 'Af': 4.0, 'GIDPp': nan, 'GIDP': nan, 'Hp': 14, 'Rp': 11, 'IBBp': nan, 'BKp': 0, 'SB': 0.0, 'ERp': 6, 'BBp': 3, 'AB': 4, 'PBfc': nan, 'Ef': 1.0, 'DPf': 0.0, 'GSp': 1, 'RBI': 0.0, 'SO': nan}}


In [131]:
stats_df = pd.DataFrame.from_dict(player_stats, orient='index')

In [132]:
stats_df['playerID'] = stats_df.index

In [133]:
print(stats_df.head())

           3B  InnOutsf  SBfc    SH  CSfc    CS   HR     POf   2B  WPfc  \
aardsda01   0    1011.0   NaN   1.0   NaN   0.0    0    11.0    0   NaN   
aaronha01  98   78413.0   NaN  21.0   NaN  73.0  755  7436.0  624   NaN   
aaronto01   6    6472.0   NaN   9.0   NaN   8.0   13  1317.0   42   NaN   
aasedo01    0    3328.0   NaN   0.0   NaN   0.0    0    67.0    0   NaN   
abadan01    0     138.0   NaN   0.0   NaN   1.0    0    37.0    0   NaN   

             ...        BBp   GSp  AS_games  Gold Glove  Silver Slugger  HoF  \
aardsda01    ...      183.0   0.0       NaN         NaN             NaN  NaN   
aaronha01    ...        NaN   NaN      25.0         3.0             NaN  1.0   
aaronto01    ...        NaN   NaN       NaN         NaN             NaN  NaN   
aasedo01     ...      457.0  91.0       1.0         NaN             NaN  NaN   
abadan01     ...        NaN   NaN       NaN         NaN             NaN  NaN   

           Most Valuable Player  Cy Young Award  Rookie of the Year 

In [134]:
print(master_df.head())

    playerID nameFirst nameLast bats throws       debut   finalGame
0  aardsda01     David  Aardsma    R      R  2004-04-06  2015-08-23
1  aaronha01      Hank    Aaron    R      R  1954-04-13  1976-10-03
2  aaronto01    Tommie    Aaron    R      R  1962-04-10  1971-09-26
3   aasedo01       Don     Aase    R      R  1977-07-26  1990-10-03
4   abadan01      Andy     Abad    L      L  2001-09-10  2006-04-13


In [135]:
df = master_df.join(stats_df,on='playerID',how='inner',rsuffix='mstr')

In [136]:
print(df.head())
print(len(stats_df))
print(len(df))
print(len(master_df))

    playerID nameFirst nameLast bats throws       debut   finalGame  3B  \
0  aardsda01     David  Aardsma    R      R  2004-04-06  2015-08-23   0   
1  aaronha01      Hank    Aaron    R      R  1954-04-13  1976-10-03  98   
2  aaronto01    Tommie    Aaron    R      R  1962-04-10  1971-09-26   6   
3   aasedo01       Don     Aase    R      R  1977-07-26  1990-10-03   0   
4   abadan01      Andy     Abad    L      L  2001-09-10  2006-04-13   0   

   InnOutsf  SBfc      ...         BBp   GSp  AS_games  Gold Glove  \
0    1011.0   NaN      ...       183.0   0.0       NaN         NaN   
1   78413.0   NaN      ...         NaN   NaN      25.0         3.0   
2    6472.0   NaN      ...         NaN   NaN       NaN         NaN   
3    3328.0   NaN      ...       457.0  91.0       1.0         NaN   
4     138.0   NaN      ...         NaN   NaN       NaN         NaN   

   Silver Slugger  HoF  Most Valuable Player  Cy Young Award  \
0             NaN  NaN                   NaN             NaN   


In [137]:
def bats_throws(col):
    if col == "R":
        return 1
    else:
        return 0
        
df['bats_R'] = df['bats'].apply(bats_throws)
df['throws_R'] = df['throws'].apply(bats_throws)

In [138]:
pos_list = []
pos_dict = {}
for i, row in fielding_df.iterrows():
    playerID = row['playerID']
    games = row['G']
    pos = row['Pos']
    if playerID in pos_dict:
        if pos in pos_dict[playerID]:
            pos_dict[playerID][pos] = pos_dict[playerID][pos] + games
        else:
            pos_dict[playerID][pos] = games
    else:
        pos_dict[playerID] = {}
        pos_dict[playerID][pos] = games

In [139]:
print(len(player_stats))

18915


In [140]:
primary_pos_dict = {}
player_list = []
for k, v in pos_dict.items():
    playerID = k
    primary_pos_dict[playerID] = {}
    for key, val in v.items():
        primary_pos_dict[playerID]['game_count'] = 0
        if val > primary_pos_dict[playerID]['game_count']:
            primary_pos_dict[playerID]['pos'] = key
            primary_pos_dict[playerID]['game_count'] = val

In [141]:
print(len(primary_pos_dict))
print(primary_pos_dict)

18714
{'millewa03': {'pos': 'P', 'game_count': 3}, 'butlear01': {'pos': 'SS', 'game_count': 236}, 'kented01': {'pos': 'P', 'game_count': 1}, 'hackewa02': {'pos': 'P', 'game_count': 306}, 'riegeel01': {'pos': 'P', 'game_count': 13}, 'millela02': {'pos': 'OF', 'game_count': 404}, 'stallvi01': {'pos': 'SS', 'game_count': 556}, 'shorebi01': {'pos': 'P', 'game_count': 96}, 'priddje01': {'pos': 'SS', 'game_count': 19}, 'cruzju02': {'pos': 'P', 'game_count': 447}, 'woodsji01': {'pos': '3B', 'game_count': 26}, 'holmbi01': {'pos': 'C', 'game_count': 114}, 'sheanda01': {'pos': 'SS', 'game_count': 38}, 'luquedo01': {'pos': 'P', 'game_count': 550}, 'mckiran01': {'pos': 'P', 'game_count': 27}, 'fletcel01': {'pos': '1B', 'game_count': 1380}, 'goochch01': {'pos': 'SS', 'game_count': 1}, 'galloch01': {'pos': '2B', 'game_count': 5}, 'conrobi01': {'pos': '1B', 'game_count': 6}, 'taborgr01': {'pos': '2B', 'game_count': 4}, 'veddelo01': {'pos': 'P', 'game_count': 1}, 'bellro01': {'pos': 'P', 'game_count':

In [142]:
primary_pos_df = pd.DataFrame.from_dict(primary_pos_dict, orient='index')

In [143]:
primary_pos_df = primary_pos_df.drop('game_count', axis=1)

In [144]:
df = df.join(primary_pos_df,on='playerID',how='inner')

In [145]:
dummy_df = pd.get_dummies(df['pos'], prefix='pos_')
df = pd.concat([df, dummy_df], axis=1)

In [146]:
from datetime import datetime
df['debut'] =  pd.to_datetime(df['debut'])
df['finalGame'] = pd.to_datetime(df['finalGame'])

In [147]:
df['debutYear'] = pd.to_numeric(df['debut'].dt.strftime('%Y'), errors='coerce')
df['finalYear'] = pd.to_numeric(df['finalGame'].dt.strftime('%Y'), errors='coerce')

In [148]:
df['YSR'] = 2016 - df['finalYear']

In [168]:
df = df[df['YSR'] > 15]

In [202]:
df_hof = df[df['HoF'] == 1]
print(df_hof)
print(len(df_hof))

        playerID nameFirst     nameLast bats throws      debut  finalGame  \
1      aaronha01      Hank        Aaron    R      R 1954-04-13 1976-10-03   
177    alexape01      Pete    Alexander    R      R 1911-04-15 1930-05-28   
389    ansonca01       Cap        Anson    R      R 1871-05-06 1897-10-03   
398    aparilu01      Luis     Aparicio    R      R 1956-04-17 1973-09-28   
405    applilu01      Luke      Appling    R      R 1930-09-10 1950-10-01   
480    ashburi01    Richie      Ashburn    L      R 1948-04-20 1962-09-30   
541    averiea01      Earl      Averill    L      R 1929-04-16 1941-04-25   
642    bakerfr01  Home Run        Baker    L      R 1908-09-21 1922-09-29   
702    bancrda01      Dave     Bancroft    B      R 1915-04-14 1930-05-31   
714    bankser01     Ernie        Banks    R      R 1953-09-17 1971-09-26   
1028   becklja01      Jake      Beckley    L      L 1888-06-20 1907-06-15   
1133   benchjo01    Johnny        Bench    R      R 1967-08-28 1983-09-29   

In [150]:
#Creating HoF Eligibility Column
def eligibility(y):
    if y <= 15:
        return 1
    else:
        return 0
    
df['stillEligible'] = df['YSR'].apply(eligibility)

In [151]:
print(df['stillEligible'].head())

1    0
2    0
3    0
4    1
6    0
Name: stillEligible, dtype: int64


In [152]:
print(df.head())

    playerID nameFirst nameLast bats throws      debut  finalGame  3B  \
1  aaronha01      Hank    Aaron    R      R 1954-04-13 1976-10-03  98   
2  aaronto01    Tommie    Aaron    R      R 1962-04-10 1971-09-26   6   
3   aasedo01       Don     Aase    R      R 1977-07-26 1990-10-03   0   
4   abadan01      Andy     Abad    L      L 2001-09-10 2006-04-13   0   
6  abadijo01      John   Abadie    R      R 1875-04-26 1875-06-10   0   

   InnOutsf  SBfc      ...        pos__2B  pos__3B  pos__C  pos__OF  pos__P  \
1   78413.0   NaN      ...            1.0      0.0     0.0      0.0     0.0   
2    6472.0   NaN      ...            1.0      0.0     0.0      0.0     0.0   
3    3328.0   NaN      ...            0.0      0.0     0.0      0.0     1.0   
4     138.0   NaN      ...            0.0      0.0     0.0      0.0     0.0   
6       NaN   NaN      ...            0.0      0.0     0.0      0.0     0.0   

   pos__SS  debutYear  finalYear    YSR  stillEligible  
1      0.0     1954.0     197

In [169]:
print(df.columns)

Index(['playerID', 'nameFirst', 'nameLast', 'bats', 'throws', 'debut',
       'finalGame', '3B', 'InnOutsf', 'SBfc', 'SH', 'CSfc', 'CS', 'HR', 'POf',
       '2B', 'WPfc', 'Af', 'Gf', 'SO', 'BB', 'GIDP', 'HBP', 'G', 'SB', 'H',
       'GSf', 'AB', 'PBfc', 'Ef', 'DPf', 'R', 'RBI', 'IBB', 'Wp', 'SFp', 'GFp',
       'Gp', 'WPp', 'HBPp', 'SHp', 'CGp', 'SOp', 'Lp', 'HRp', 'SHOp', 'SVp',
       'IPOutsp', 'BFPp', 'GIDPp', 'Hp', 'Rp', 'IBBp', 'BKp', 'ERp', 'BBp',
       'GSp', 'AS_games', 'Gold Glove', 'Silver Slugger', 'HoF',
       'Most Valuable Player', 'Cy Young Award', 'Rookie of the Year',
       'playerIDmstr', 'bats_R', 'throws_R', 'pos', 'pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__P', 'pos__SS', 'debutYear',
       'finalYear', 'YSR', 'stillEligible'],
      dtype='object')


In [187]:
numeric_cols = ['Rp', 'SVp', 'RBI', 'BFPp', 'HRp', 'SHOp', 'CSfc', 'SB', 'GSp', 'CS', 'Ef', 'R', 'IPOutsp',
       'PBfc', 'BB', 'SBfc', 'Af', '2B', '3B', 'SO',
       'HR', 'CGp', 'WPfc', 'Lp', 'G', 'IBB',
       'H', 'SOp', 'AB', 'ERp', 'Wp', 'AS_games', 'HoF', 'Gold Glove', 'Rookie of the Year',
       'Silver Slugger', 'Most Valuable Player', 'Cy Young Award', 'bats_R',
       'throws_R','pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__P', 'pos__SS', 'YSR']
data = df[numeric_cols]

In [188]:
numeric_cols1 = ['Rp', 'SVp', 'GIDP', 'RBI', 'IBBp', 'BFPp', 'InnOutsf',
       'HRp', 'SHOp', 'CSfc', 'SB', 'GSp', 'CS', 'Ef', 'R', 'BKp', 'IPOutsp',
       'WPp', 'PBfc', 'BB', 'SBfc', 'Af', 'HBPp', '2B', '3B', 'SFp', 'SO',
       'HR', 'DPf', 'POf', 'Gf', 'CGp', 'WPfc', 'Lp', 'Gp', 'SH', 'G', 'IBB',
       'H', 'SOp', 'AB', 'ERp', 'Hp', 'GSf', 'SHp', 'HBP', 'BBp', 'GIDPp',
       'Wp', 'GFp', 'AS_games', 'HoF', 'Gold Glove', 'Rookie of the Year',
       'Silver Slugger', 'Most Valuable Player', 'Cy Young Award', 'bats_R',
       'throws_R','pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__P', 'pos__SS', 'YSR']

In [189]:
data = data.fillna(0)

In [190]:
print(data.head())

      Rp   SVp     RBI    BFPp   HRp  SHOp  CSfc     SB   GSp    CS  ...    \
1    0.0   0.0  2297.0     0.0   0.0   0.0   0.0  240.0   0.0  73.0  ...     
2    0.0   0.0    94.0     0.0   0.0   0.0   0.0    9.0   0.0   8.0  ...     
3  503.0  82.0     0.0  4730.0  89.0   5.0   0.0    0.0  91.0   0.0  ...     
6    0.0   0.0     5.0     0.0   0.0   0.0   0.0    1.0   0.0   0.0  ...     
7    0.0   0.0   324.0     0.0   0.0   0.0   0.0  142.0   0.0   0.0  ...     

   bats_R  throws_R  pos__1B  pos__2B  pos__3B  pos__C  pos__OF  pos__P  \
1       1         1      0.0      1.0      0.0     0.0      0.0     0.0   
2       1         1      0.0      1.0      0.0     0.0      0.0     0.0   
3       1         1      0.0      0.0      0.0     0.0      0.0     1.0   
6       1         1      1.0      0.0      0.0     0.0      0.0     0.0   
7       1         1      0.0      0.0      0.0     0.0      0.0     0.0   

   pos__SS    YSR  
1      0.0   40.0  
2      0.0   45.0  
3      0.0   26.0  


In [191]:
target = data['HoF']
features = data.drop('HoF', axis=1)

In [192]:
print(target)

1        1.0
2        0.0
3        0.0
6        0.0
7        0.0
8        0.0
9        0.0
10       0.0
11       0.0
12       0.0
14       0.0
16       0.0
17       0.0
19       0.0
20       0.0
22       0.0
24       0.0
25       0.0
26       0.0
27       0.0
28       0.0
29       0.0
30       0.0
31       0.0
32       0.0
34       0.0
39       0.0
45       0.0
46       0.0
48       0.0
        ... 
19063    0.0
19064    0.0
19065    0.0
19066    0.0
19067    0.0
19068    0.0
19069    0.0
19070    0.0
19071    0.0
19072    0.0
19074    0.0
19076    0.0
19079    0.0
19080    0.0
19081    0.0
19082    0.0
19083    0.0
19085    0.0
19086    0.0
19088    0.0
19089    0.0
19092    0.0
19093    0.0
19094    0.0
19095    0.0
19099    0.0
19100    0.0
19101    0.0
19102    0.0
19103    0.0
Name: HoF, dtype: float64


## Don't forget to deal with class imbalance.

In [193]:
from sklearn.cross_validation import cross_val_predict, KFold
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight='balanced')
kf = KFold(features.shape[0], random_state=1)

predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

In [194]:
tn_filter = (predictions == 0) & (data['HoF'] == 0)
tn = len(predictions[tn_filter])

tp_filter = (predictions == 1) & (data['HoF'] == 1)
tp = len(predictions[tp_filter])

fn_filter = (predictions == 0) & (data['HoF'] == 1)
fn = len(predictions[fn_filter])

fp_filter = (predictions == 1) & (data['HoF'] == 0)
fp = len(predictions[fp_filter])

In [195]:
print(tn)
print(tp)
print(fn)
print(fp)
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)
print(fpr)
print(tpr)

9856
5
144
376
0.03674745895230649
0.03355704697986577


In [196]:
print(len(target))
print(len(df))
print(len(predictions))

14070
14070
14070


In [197]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=1, class_weight='balanced')
predictions = cross_val_predict(rf, features, target, cv=kf)
predictions= pd.Series(predictions)

In [198]:
tn_filter = (predictions == 0) & (data['HoF'] == 0)
tn = len(predictions[tn_filter])

tp_filter = (predictions == 1) & (data['HoF'] == 1)
tp = len(predictions[tp_filter])

fn_filter = (predictions == 0) & (data['HoF'] == 1)
fn = len(predictions[fn_filter])

fp_filter = (predictions == 1) & (data['HoF'] == 0)
fp = len(predictions[fp_filter])

In [199]:
print(tn)
print(tp)
print(fn)
print(fp)

10156
1
148
76


In [200]:
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

In [201]:
print(fpr)
print(tpr)

0.007427677873338546
0.006711409395973154
