## Part II: Predicting Hall of Fame Careers

In [70]:
# Connecting to SQLite Database
import pandas as pd
import sqlite3
conn = sqlite3.connect("lahman2016.sqlite")

In [71]:
queryMaster = 'select playerID,nameFirst,nameLast,bats,throws,debut,finalGame from Master;'
queryPitching = 'select * from Pitching;'
queryFielding = 'select * from Fielding;'
queryAwards = 'select playerID,awardID,yearID from AwardsPlayers;'
queryAllStar = 'select playerID, YearID from AllstarFull;'
queryHOF = 'select playerID,yearid from HallofFame where inducted == "Y" and category == "Player";'

In [72]:
batting_df = pd.read_csv('Batting.csv')

In [73]:
master = conn.execute(queryMaster).fetchall()
master_df = pd.DataFrame(master)
pitching = conn.execute(queryPitching).fetchall()
pitching_df = pd.DataFrame(pitching)
fielding = conn.execute(queryFielding).fetchall()
fielding_df = pd.DataFrame(fielding)
awards = conn.execute(queryAwards).fetchall()
awards_df = pd.DataFrame(awards)
allstar = conn.execute(queryAllStar).fetchall()
allstar_df = pd.DataFrame(allstar)
hof = conn.execute(queryHOF).fetchall()
hof_df = pd.DataFrame(hof)

In [74]:
master_cols = ['playerID','nameFirst','nameLast','bats','throws','debut','finalGame']
master_df.columns = master_cols

pitching_cols = ['playerID','yearID','stint','teamID','lgID','W','L','G','GS','CG','SHO',
'SV','IPOuts','H','ER','HR','BB','SO','BAOpp','ERA','IBB','WP','HBP','BK','BFP','GF','R','SH','SF','GIDP']
pitching_df.columns = pitching_cols

fielding_cols = ['playerID','yearID','stint','teamID','lgID','Pos','G','GS','InnOuts','PO','A','E','DP','PB','WP','SB','CS','ZR']
fielding_df.columns = fielding_cols

awards_cols = ['playerID','awardID','yearID']
awards_df.columns = awards_cols

allstar_cols = ['playerID','YearID']
allstar_df.columns = allstar_cols

hof_cols = ['playerID','yearid']
hof_df.columns = hof_cols

In [75]:
print(batting_df.head())

    playerID  yearID  stint teamID lgID   G   AB   R   H  2B  ...    RBI   SB  \
0  abercda01    1871      1    TRO  NaN   1    4   0   0   0  ...    0.0  0.0   
1   addybo01    1871      1    RC1  NaN  25  118  30  32   6  ...   13.0  8.0   
2  allisar01    1871      1    CL1  NaN  29  137  28  40   4  ...   19.0  3.0   
3  allisdo01    1871      1    WS3  NaN  27  133  28  44  10  ...   27.0  1.0   
4  ansonca01    1871      1    RC1  NaN  25  120  29  39  11  ...   16.0  6.0   

    CS  BB   SO  IBB  HBP  SH  SF  GIDP  
0  0.0   0  0.0  NaN  NaN NaN NaN   NaN  
1  1.0   4  0.0  NaN  NaN NaN NaN   NaN  
2  1.0   2  5.0  NaN  NaN NaN NaN   NaN  
3  1.0   0  2.0  NaN  NaN NaN NaN   NaN  
4  2.0   2  1.0  NaN  NaN NaN NaN   NaN  

[5 rows x 22 columns]


In [76]:
player_stats = {}
for i, row in batting_df.iterrows():
    playerID = row['playerID']
    G = row['G']
    AB = row['AB']
    R = row['R']
    H = row['H']
    twoB = row['2B']
    threeB = row['3B']
    HR = row['HR']
    RBI = row['RBI']
    SB = row['SB']
    CS = row['CS']
    BB = row['BB']
    SO = row['SO']
    IBB = row['IBB']
    HBP = row['HBP']
    SH = row['SH']
    SF = row['SF']
    GIDP = row['GIDP']
    if playerID in player_stats:
        player_stats[playerID]['G'] = player_stats[playerID]['G'] + G
        player_stats[playerID]['AB'] = player_stats[playerID]['AB'] + AB
        player_stats[playerID]['R'] = player_stats[playerID]['R'] + R
        player_stats[playerID]['H'] = player_stats[playerID]['H'] + H
        player_stats[playerID]['2B'] = player_stats[playerID]['2B'] + twoB
        player_stats[playerID]['3B'] = player_stats[playerID]['3B'] + threeB
        player_stats[playerID]['HR'] = player_stats[playerID]['HR'] + HR
        player_stats[playerID]['RBI'] = player_stats[playerID]['RBI'] + RBI
        player_stats[playerID]['SB'] = player_stats[playerID]['SB'] + SB
        player_stats[playerID]['CS'] = player_stats[playerID]['CS'] + CS
        player_stats[playerID]['BB'] = player_stats[playerID]['BB'] + BB
        player_stats[playerID]['SO'] = player_stats[playerID]['SO'] + SO
        player_stats[playerID]['IBB'] = player_stats[playerID]['IBB'] + IBB
        player_stats[playerID]['HBP'] = player_stats[playerID]['HBP'] + HBP
        player_stats[playerID]['SH'] = player_stats[playerID]['SH'] + SH
        player_stats[playerID]['GIDP'] = player_stats[playerID]['GIDP'] + GIDP
    else:
        player_stats[playerID] = {}
        player_stats[playerID]['G'] = G
        player_stats[playerID]['AB'] = AB
        player_stats[playerID]['R'] = R
        player_stats[playerID]['H'] = H
        player_stats[playerID]['2B'] = twoB
        player_stats[playerID]['3B'] = threeB
        player_stats[playerID]['HR'] = HR
        player_stats[playerID]['RBI'] = RBI
        player_stats[playerID]['SB'] = SB
        player_stats[playerID]['CS'] = CS
        player_stats[playerID]['BB'] = BB
        player_stats[playerID]['SO'] = SO
        player_stats[playerID]['IBB'] = IBB
        player_stats[playerID]['HBP'] = HBP
        player_stats[playerID]['SH'] = SH
        player_stats[playerID]['GIDP'] = GIDP        


In [77]:
pitcher_list = []
for i, row in pitching_df.iterrows():
    playerID = row['playerID']
    Wp = row['W']
    Lp = row['L']
    Gp = row['G']
    GSp = row['GS']
    CGp = row['CG']
    SHOp = row['SHO']
    SVp = row['SV']
    IPOutsp = row['IPOuts']
    Hp = row['H']
    ERp = row['ER']
    HRp = row['HR']
    BBp = row['BB']
    SOp = row['SO']
    IBBp = row['IBB']
    WPp = row['WP']
    HBPp = row['HBP']
    BKp = row['BK']
    BFPp = row['BFP']
    GFp = row['GF']
    Rp = row['R']
    SHp = row['SH']
    SFp = row['SF']
    GIDPp = row['GIDP']
    if playerID in player_stats and playerID in pitcher_list:
        player_stats[playerID]['Wp'] = player_stats[playerID]['Wp'] + Wp
        player_stats[playerID]['Lp'] = player_stats[playerID]['Lp'] + Lp
        player_stats[playerID]['Gp'] = player_stats[playerID]['Gp'] + Gp
        player_stats[playerID]['GSp'] = player_stats[playerID]['GSp'] + GSp
        player_stats[playerID]['CGp'] = player_stats[playerID]['CGp'] + CGp
        player_stats[playerID]['SHOp'] = player_stats[playerID]['SHOp'] + SHOp
        player_stats[playerID]['SVp'] = player_stats[playerID]['SVp'] + SVp
        player_stats[playerID]['IPOutsp'] = player_stats[playerID]['IPOutsp'] + IPOutsp
        player_stats[playerID]['Hp'] = player_stats[playerID]['Hp'] + Hp
        player_stats[playerID]['ERp'] = player_stats[playerID]['ERp'] + ERp
        player_stats[playerID]['HRp'] = player_stats[playerID]['HRp'] + HRp
        player_stats[playerID]['BBp'] = player_stats[playerID]['BBp'] + BBp
        player_stats[playerID]['SOp'] = player_stats[playerID]['SOp'] + SOp
        player_stats[playerID]['IBBp'] = player_stats[playerID]['IBBp'] + IBBp
        player_stats[playerID]['WPp'] = player_stats[playerID]['WPp'] + WPp
        player_stats[playerID]['HBPp'] = player_stats[playerID]['HBPp'] + HBPp
        player_stats[playerID]['BKp'] = player_stats[playerID]['BKp'] + BKp
        player_stats[playerID]['BFPp'] = player_stats[playerID]['BFPp'] + BFPp
        player_stats[playerID]['GFp'] = player_stats[playerID]['GFp'] + GFp
        player_stats[playerID]['Rp'] = player_stats[playerID]['Rp'] + Rp
        player_stats[playerID]['SHp'] = player_stats[playerID]['SHp'] + SHp
        player_stats[playerID]['SFp'] = player_stats[playerID]['SFp'] + SFp
        player_stats[playerID]['GIDPp'] = player_stats[playerID]['GIDPp'] + GIDPp
    else:
        pitcher_list.append(playerID)
        player_stats[playerID]['Wp'] = Wp
        player_stats[playerID]['Lp'] = Lp
        player_stats[playerID]['Gp'] = Gp
        player_stats[playerID]['GSp'] = GSp
        player_stats[playerID]['CGp'] = CGp
        player_stats[playerID]['SHOp'] = SHOp
        player_stats[playerID]['SVp'] = SVp
        player_stats[playerID]['IPOutsp'] = IPOutsp
        player_stats[playerID]['Hp'] = Hp
        player_stats[playerID]['ERp'] = ERp
        player_stats[playerID]['HRp'] = HRp
        player_stats[playerID]['BBp'] = BBp
        player_stats[playerID]['SOp'] = SOp
        player_stats[playerID]['IBBp'] = IBBp
        player_stats[playerID]['WPp'] = WPp
        player_stats[playerID]['HBPp'] = HBPp
        player_stats[playerID]['BKp'] = BKp
        player_stats[playerID]['BFPp'] = BFPp
        player_stats[playerID]['GFp'] = GFp
        player_stats[playerID]['Rp'] = Rp
        player_stats[playerID]['SHp'] = SHp
        player_stats[playerID]['SFp'] = SFp
        player_stats[playerID]['GIDPp'] = GIDPp

        

In [78]:
fielder_list = []
for i, row in fielding_df.iterrows():
    playerID = row['playerID']
    Gf = row['G']
    GSf = row['GS']
    InnOutsf = row['InnOuts']
    POf = row['PO']
    Af = row['A']
    Ef = row['E']
    DPf = row['DP']
    PBfc = row['PB']
    WPfc = row['WP']
    SBfc = row['SB']
    CSfc = row['CS']
    if playerID in player_stats and playerID in fielder_list:
        player_stats[playerID]['Gf'] = player_stats[playerID]['Gf'] + Gf
        player_stats[playerID]['GSf'] = player_stats[playerID]['GSf'] + GSf
        player_stats[playerID]['InnOutsf'] = player_stats[playerID]['InnOutsf'] + InnOutsf
        player_stats[playerID]['POf'] = player_stats[playerID]['POf'] + POf
        player_stats[playerID]['Af'] = player_stats[playerID]['Af'] + Af
        player_stats[playerID]['Ef'] = player_stats[playerID]['Ef'] + Ef
        player_stats[playerID]['DPf'] = player_stats[playerID]['DPf'] + DPf
        player_stats[playerID]['PBfc'] = player_stats[playerID]['PBfc'] + PBfc
        player_stats[playerID]['WPfc'] = player_stats[playerID]['WPfc'] + WPfc
        player_stats[playerID]['SBfc'] = player_stats[playerID]['SBfc'] + SBfc
        player_stats[playerID]['CSfc'] = player_stats[playerID]['CSfc'] + CSfc
    else:
        fielder_list.append(playerID)
        player_stats[playerID]['Gf'] = Gf
        player_stats[playerID]['GSf'] = GSf
        player_stats[playerID]['InnOutsf'] = InnOutsf
        player_stats[playerID]['POf'] = POf
        player_stats[playerID]['Af'] = Af
        player_stats[playerID]['Ef'] = Ef
        player_stats[playerID]['DPf'] = DPf
        player_stats[playerID]['PBfc'] = PBfc
        player_stats[playerID]['WPfc'] = WPfc
        player_stats[playerID]['SBfc'] = SBfc
        player_stats[playerID]['CSfc'] = CSfc

In [79]:
print(awards_df['awardID'].unique())

['Pitching Triple Crown' 'Triple Crown' 'Baseball Magazine All-Star'
 'Most Valuable Player' 'TSN All-Star' 'TSN Guide MVP'
 'TSN Major League Player of the Year' 'TSN Pitcher of the Year'
 'TSN Player of the Year' 'Rookie of the Year' 'Babe Ruth Award'
 'Lou Gehrig Memorial Award' 'World Series MVP' 'Cy Young Award'
 'Gold Glove' 'TSN Fireman of the Year' 'All-Star Game MVP' 'Hutch Award'
 'Roberto Clemente Award' 'Rolaids Relief Man Award' 'NLCS MVP' 'ALCS MVP'
 'Silver Slugger' 'Branch Rickey Award' 'Hank Aaron Award'
 'TSN Reliever of the Year' 'Comeback Player of the Year'
 'Outstanding DH Award' 'Reliever of the Year Award']


In [80]:
mvp_df = awards_df[awards_df['awardID'] == 'Most Valuable Player']
roy_df = awards_df[awards_df['awardID'] == 'Rookie of the Year']
cy_df = awards_df[awards_df['awardID'] == 'Cy Young Award']
gg_df = awards_df[awards_df['awardID'] == 'Gold Glove']
ss_df = awards_df[awards_df['awardID'] == 'Silver Slugger']

awards_list = [mvp_df,roy_df,cy_df,gg_df,ss_df]

In [81]:
print(awards_list[1].head())

       playerID             awardID  yearID
1796  robinja02  Rookie of the Year    1947
1845   darkal01  Rookie of the Year    1948
1899  sievero01  Rookie of the Year    1949
1900  newcodo01  Rookie of the Year    1949
1953  dropowa01  Rookie of the Year    1950


In [82]:
mvp_list = []
roy_list = []
cy_list = []
gg_list = []
ss_list = []
lists = [mvp_list,roy_list,cy_list,gg_list,ss_list]

In [83]:
for index, v in enumerate(awards_list):
    for i, row in v.iterrows():
        playerID = row['playerID']
        award = row['awardID']
        if playerID in player_stats and playerID in lists[index]:
            player_stats[playerID][award] += 1
        else:
            lists[index].append(playerID)
            player_stats[playerID][award] = 1

In [84]:
allstar_list = []
for i, row in allstar_df.iterrows():
    playerID = row['playerID']
    if playerID in player_stats and playerID in allstar_list:
        player_stats[playerID]['AS_games'] += 1
    else:
        allstar_list.append(playerID)
        player_stats[playerID]['AS_games'] = 1

In [85]:
for i, row in hof_df.iterrows():
    playerID = row['playerID']
    if playerID in player_stats:
        player_stats[playerID]['HoF'] = 1

In [86]:
print(len(hof_df))
print(len(master_df))

250
19105


In [87]:
print(dict(list(player_stats.items())[0:2]))

{'saverjo01': {'GSf': 0.0, 'HBP': 0.0, 'InnOutsf': 155.0, 'Wp': 3, 'H': 0, 'HR': 0, 'WPp': 0.0, 'HRp': 5, 'ERp': 22, 'Lp': 2, 'SO': 1.0, 'IBBp': 1.0, 'CGp': 0, 'Af': 10.0, 'Ef': 2.0, 'CSfc': nan, 'GIDPp': 3.0, 'PBfc': nan, 'SH': 1.0, 'R': 0, 'BFPp': 218.0, 'G': 44, 'Gp': 44, '2B': 0, 'SOp': 32, 'BKp': 1, 'Hp': 45, 'SFp': 3.0, 'CS': 0.0, 'SBfc': nan, 'AB': 2, 'SB': 0.0, 'GIDP': 0.0, 'IBB': 0.0, 'GFp': 19.0, 'Gf': 44, 'SHp': 4.0, 'IPOutsp': 155, 'WPfc': nan, 'Rp': 28, 'HBPp': 1.0, 'SVp': 0, '3B': 0, 'BBp': 20, 'SHOp': 0, 'BB': 0, 'POf': 2, 'DPf': 0.0, 'GSp': 0, 'RBI': 0.0}, 'floredo01': {'GSf': 0.0, 'HBP': 0.0, 'InnOutsf': 36.0, 'Wp': 3, 'H': 0, 'HR': 0, 'WPp': 0.0, 'HRp': 0, 'ERp': 2, 'Lp': 0, 'SO': 1.0, 'IBBp': 0.0, 'CGp': 0, 'Af': 4.0, 'Ef': 0.0, 'CSfc': nan, 'GIDPp': nan, 'PBfc': nan, 'SH': 0.0, 'R': 0, 'BFPp': 57.0, 'G': 14, 'Gp': 14, '2B': 0, 'SOp': 5, 'BKp': 0, 'Hp': 17, 'SFp': nan, 'CS': 0.0, 'SBfc': nan, 'AB': 1, 'SB': 0.0, 'GIDP': 0.0, 'IBB': 0.0, 'GFp': 3.0, 'Gf': 14, 'SHp': n

In [88]:
stats_df = pd.DataFrame.from_dict(player_stats, orient='index')

In [89]:
stats_df['playerID'] = stats_df.index

In [90]:
print(stats_df.head())

              GSf   HBP  InnOutsf    Wp     H   HR   WPp   HRp    ERp    Lp  \
aardsda01     0.0   0.0    1011.0  16.0     0    0  12.0  41.0  160.0  18.0   
aaronha01  2977.0  32.0   78413.0   NaN  3771  755   NaN   NaN    NaN   NaN   
aaronto01   206.0   0.0    6472.0   NaN   216   13   NaN   NaN    NaN   NaN   
aasedo01     91.0   0.0    3328.0  66.0     0    0  21.0  89.0  468.0  60.0   
abadan01      4.0   0.0     138.0   NaN     2    0   NaN   NaN    NaN   NaN   

             ...       GSp     RBI  AS_games  Gold Glove  HoF  Cy Young Award  \
aardsda01    ...       0.0     0.0       NaN         NaN  NaN             NaN   
aaronha01    ...       NaN  2297.0      25.0         3.0  1.0             NaN   
aaronto01    ...       NaN    94.0       NaN         NaN  NaN             NaN   
aasedo01     ...      91.0     0.0       1.0         NaN  NaN             NaN   
abadan01     ...       NaN     0.0       NaN         NaN  NaN             NaN   

           Most Valuable Player  Silve

In [91]:
print(master_df.head())

    playerID nameFirst nameLast bats throws       debut   finalGame
0  aardsda01     David  Aardsma    R      R  2004-04-06  2015-08-23
1  aaronha01      Hank    Aaron    R      R  1954-04-13  1976-10-03
2  aaronto01    Tommie    Aaron    R      R  1962-04-10  1971-09-26
3   aasedo01       Don     Aase    R      R  1977-07-26  1990-10-03
4   abadan01      Andy     Abad    L      L  2001-09-10  2006-04-13


In [92]:
df = master_df.join(stats_df,on='playerID',how='inner',rsuffix='mstr')

In [93]:
print(df.head())
print(len(stats_df))
print(len(df))
print(len(master_df))

    playerID nameFirst nameLast bats throws       debut   finalGame     GSf  \
0  aardsda01     David  Aardsma    R      R  2004-04-06  2015-08-23     0.0   
1  aaronha01      Hank    Aaron    R      R  1954-04-13  1976-10-03  2977.0   
2  aaronto01    Tommie    Aaron    R      R  1962-04-10  1971-09-26   206.0   
3   aasedo01       Don     Aase    R      R  1977-07-26  1990-10-03    91.0   
4   abadan01      Andy     Abad    L      L  2001-09-10  2006-04-13     4.0   

    HBP  InnOutsf      ...        GSp     RBI  AS_games  Gold Glove  HoF  \
0   0.0    1011.0      ...        0.0     0.0       NaN         NaN  NaN   
1  32.0   78413.0      ...        NaN  2297.0      25.0         3.0  1.0   
2   0.0    6472.0      ...        NaN    94.0       NaN         NaN  NaN   
3   0.0    3328.0      ...       91.0     0.0       1.0         NaN  NaN   
4   0.0     138.0      ...        NaN     0.0       NaN         NaN  NaN   

   Cy Young Award  Most Valuable Player  Silver Slugger  Rookie of t

In [94]:
def bats_throws(col):
    if col == "R":
        return 1
    else:
        return 0
        
df['bats_R'] = df['bats'].apply(bats_throws)
df['throws_R'] = df['throws'].apply(bats_throws)

In [95]:
pos_list = []
pos_dict = {}
for i, row in fielding_df.iterrows():
    playerID = row['playerID']
    games = row['G']
    pos = row['Pos']
    if playerID in pos_dict:
        if pos in pos_dict[playerID]:
            pos_dict[playerID][pos] = pos_dict[playerID][pos] + games
        else:
            pos_dict[playerID][pos] = games
    else:
        pos_dict[playerID] = {}
        pos_dict[playerID][pos] = games

In [96]:
print(len(player_stats))

18915


In [97]:
primary_pos_dict = {}
player_list = []
for k, v in pos_dict.items():
    playerID = k
    primary_pos_dict[playerID] = {}
    for key, val in v.items():
        primary_pos_dict[playerID]['game_count'] = 0
        if val > primary_pos_dict[playerID]['game_count']:
            primary_pos_dict[playerID]['pos'] = key
            primary_pos_dict[playerID]['game_count'] = val

In [98]:
print(len(primary_pos_dict))
print(primary_pos_dict)

18714
{'floredo01': {'pos': 'P', 'game_count': 14}, 'tankede01': {'pos': 'P', 'game_count': 27}, 'cookgl01': {'pos': 'P', 'game_count': 9}, 'damicje02': {'pos': 'P', 'game_count': 7}, 'nixja01': {'pos': '2B', 'game_count': 133}, 'maysca01': {'pos': 'P', 'game_count': 490}, 'irvined01': {'pos': '3B', 'game_count': 1}, 'farrst01': {'pos': 'P', 'game_count': 509}, 'powerdo01': {'pos': 'C', 'game_count': 594}, 'seradbi01': {'pos': 'P', 'game_count': 95}, 'vonkofr01': {'pos': '3B', 'game_count': 51}, 'puhlte01': {'pos': 'OF', 'game_count': 1300}, 'macksh01': {'pos': 'OF', 'game_count': 830}, 'dottedu01': {'pos': 'C', 'game_count': 101}, 'cruzvi01': {'pos': 'P', 'game_count': 187}, 'chambad01': {'pos': 'OF', 'game_count': 51}, 'penneke01': {'pos': 'P', 'game_count': 9}, 'bellju01': {'pos': '2B', 'game_count': 156}, 'sulliji01': {'pos': 'P', 'game_count': 70}, 'ricebo01': {'pos': '2B', 'game_count': 2}, 'podsesc01': {'pos': 'OF', 'game_count': 974}, 'blankla02': {'pos': '2B', 'game_count': 19

In [99]:
primary_pos_df = pd.DataFrame.from_dict(primary_pos_dict, orient='index')

In [100]:
primary_pos_df = primary_pos_df.drop('game_count', axis=1)

In [101]:
df = df.join(primary_pos_df,on='playerID',how='inner')

In [102]:
dummy_df = pd.get_dummies(df['pos'], prefix='pos_')
df = pd.concat([df, dummy_df], axis=1)

In [103]:
print(df.head())

    playerID nameFirst nameLast bats throws       debut   finalGame     GSf  \
0  aardsda01     David  Aardsma    R      R  2004-04-06  2015-08-23     0.0   
1  aaronha01      Hank    Aaron    R      R  1954-04-13  1976-10-03  2977.0   
2  aaronto01    Tommie    Aaron    R      R  1962-04-10  1971-09-26   206.0   
3   aasedo01       Don     Aase    R      R  1977-07-26  1990-10-03    91.0   
4   abadan01      Andy     Abad    L      L  2001-09-10  2006-04-13     4.0   

    HBP  InnOutsf   ...     bats_R  throws_R  pos  pos__1B  pos__2B  pos__3B  \
0   0.0    1011.0   ...          1         1    P      0.0      0.0      0.0   
1  32.0   78413.0   ...          1         1   2B      0.0      1.0      0.0   
2   0.0    6472.0   ...          1         1   2B      0.0      1.0      0.0   
3   0.0    3328.0   ...          1         1    P      0.0      0.0      0.0   
4   0.0     138.0   ...          0         0   OF      0.0      0.0      0.0   

   pos__C  pos__OF  pos__P  pos__SS  
0     

In [104]:
print(df.columns)

Index(['playerID', 'nameFirst', 'nameLast', 'bats', 'throws', 'debut',
       'finalGame', 'GSf', 'HBP', 'InnOutsf', 'Wp', 'H', 'HR', 'WPp', 'HRp',
       'ERp', 'Lp', 'SO', 'IBBp', 'CGp', 'Af', 'Ef', 'CSfc', 'GIDPp', 'PBfc',
       'SH', 'R', 'BFPp', 'G', 'Gp', '2B', 'SOp', 'BKp', 'Hp', 'SFp', 'CS',
       'SBfc', 'AB', 'SB', 'GIDP', 'IBB', 'GFp', 'Gf', 'SHp', 'IPOutsp',
       'WPfc', 'Rp', 'HBPp', 'SVp', '3B', 'BBp', 'SHOp', 'BB', 'POf', 'DPf',
       'GSp', 'RBI', 'AS_games', 'Gold Glove', 'HoF', 'Cy Young Award',
       'Most Valuable Player', 'Silver Slugger', 'Rookie of the Year',
       'playerIDmstr', 'bats_R', 'throws_R', 'pos', 'pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__P', 'pos__SS'],
      dtype='object')


In [105]:
numeric_cols = ['Rp', 'SVp', 'GIDP', 'RBI', 'IBBp', 'BFPp', 'InnOutsf',
       'HRp', 'SHOp', 'CSfc', 'SB', 'GSp', 'CS', 'Ef', 'R', 'BKp', 'IPOutsp',
       'WPp', 'PBfc', 'BB', 'SBfc', 'Af', 'HBPp', '2B', '3B', 'SFp', 'SO',
       'HR', 'DPf', 'POf', 'Gf', 'CGp', 'WPfc', 'Lp', 'Gp', 'SH', 'G', 'IBB',
       'H', 'SOp', 'AB', 'ERp', 'Hp', 'GSf', 'SHp', 'HBP', 'BBp', 'GIDPp',
       'Wp', 'GFp', 'AS_games', 'HoF', 'Gold Glove', 'Rookie of the Year',
       'Silver Slugger', 'Most Valuable Player', 'Cy Young Award', 'bats_R',
       'throws_R','pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__P', 'pos__SS']
data = df[numeric_cols]

In [106]:
data = data.fillna(0)

In [107]:
print(data.head())

      Rp   SVp   GIDP     RBI  IBBp    BFPp  InnOutsf   HRp  SHOp  CSfc  \
0  169.0  69.0    0.0     0.0  22.0  1475.0    1011.0  41.0   0.0   0.0   
1    0.0   0.0  328.0  2297.0   0.0     0.0   78413.0   0.0   0.0   0.0   
2    0.0   0.0   36.0    94.0   0.0     0.0    6472.0   0.0   0.0   0.0   
3  503.0  82.0    0.0     0.0  45.0  4730.0    3328.0  89.0   5.0   0.0   
4    0.0   0.0    1.0     0.0   0.0     0.0     138.0   0.0   0.0   0.0   

    ...     Cy Young Award  bats_R  throws_R  pos__1B  pos__2B  pos__3B  \
0   ...                0.0       1         1      0.0      0.0      0.0   
1   ...                0.0       1         1      0.0      1.0      0.0   
2   ...                0.0       1         1      0.0      1.0      0.0   
3   ...                0.0       1         1      0.0      0.0      0.0   
4   ...                0.0       0         0      0.0      0.0      0.0   

   pos__C  pos__OF  pos__P  pos__SS  
0     0.0      0.0     1.0      0.0  
1     0.0      0.0    

In [108]:
features = data.drop('HoF', axis=1)
target = data['HoF']

In [109]:
print(target)

0        0.0
1        1.0
2        0.0
3        0.0
4        0.0
5        0.0
6        0.0
7        0.0
8        0.0
9        0.0
10       0.0
11       0.0
12       0.0
13       0.0
14       0.0
15       0.0
16       0.0
17       0.0
18       0.0
19       0.0
20       0.0
21       0.0
22       0.0
23       0.0
24       0.0
25       0.0
26       0.0
27       0.0
28       0.0
29       0.0
        ... 
19075    0.0
19076    0.0
19077    0.0
19078    0.0
19079    0.0
19080    0.0
19081    0.0
19082    0.0
19083    0.0
19084    0.0
19085    0.0
19086    0.0
19087    0.0
19088    0.0
19089    0.0
19090    0.0
19091    0.0
19092    0.0
19093    0.0
19094    0.0
19095    0.0
19096    0.0
19097    0.0
19098    0.0
19099    0.0
19100    0.0
19101    0.0
19102    0.0
19103    0.0
19104    0.0
Name: HoF, dtype: float64


## Don't forget to deal with class imbalance.

In [110]:
from sklearn.cross_validation import cross_val_predict, KFold
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight='balanced')
kf = KFold(features.shape[0], random_state=1)

predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

In [111]:
tn_filter = (predictions == 0) & (data['HoF'] == 0)
tn = len(predictions[tn_filter])

tp_filter = (predictions == 1) & (data['HoF'] == 1)
tp = len(predictions[tp_filter])

fn_filter = (predictions == 0) & (data['HoF'] == 1)
fn = len(predictions[fn_filter])

fp_filter = (predictions == 1) & (data['HoF'] == 0)
fp = len(predictions[fp_filter])

In [112]:
print(tn)
print(tp)
print(fn)
print(fp)
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)
print(fpr)
print(tpr)

17249
11
208
861
0.04754279403644395
0.0502283105022831


In [113]:
print(target)

0        0.0
1        1.0
2        0.0
3        0.0
4        0.0
5        0.0
6        0.0
7        0.0
8        0.0
9        0.0
10       0.0
11       0.0
12       0.0
13       0.0
14       0.0
15       0.0
16       0.0
17       0.0
18       0.0
19       0.0
20       0.0
21       0.0
22       0.0
23       0.0
24       0.0
25       0.0
26       0.0
27       0.0
28       0.0
29       0.0
        ... 
19075    0.0
19076    0.0
19077    0.0
19078    0.0
19079    0.0
19080    0.0
19081    0.0
19082    0.0
19083    0.0
19084    0.0
19085    0.0
19086    0.0
19087    0.0
19088    0.0
19089    0.0
19090    0.0
19091    0.0
19092    0.0
19093    0.0
19094    0.0
19095    0.0
19096    0.0
19097    0.0
19098    0.0
19099    0.0
19100    0.0
19101    0.0
19102    0.0
19103    0.0
19104    0.0
Name: HoF, dtype: float64


In [114]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=1, class_weight='balanced')
predictions = cross_val_predict(rf, features, target, cv=kf)
predictions= pd.Series(predictions)

In [115]:
tn_filter = (predictions == 0) & (data['HoF'] == 0)
tn = len(predictions[tn_filter])

tp_filter = (predictions == 1) & (data['HoF'] == 1)
tp = len(predictions[tp_filter])

fn_filter = (predictions == 0) & (data['HoF'] == 1)
fn = len(predictions[fn_filter])

fp_filter = (predictions == 1) & (data['HoF'] == 0)
fp = len(predictions[fp_filter])

In [116]:
print(tn)
print(tp)
print(fn)
print(fp)

18006
1
218
104


In [117]:
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

In [118]:
print(fpr)
print(tpr)

0.005742683600220872
0.0045662100456621
