## Part II: Predicting Hall of Fame Careers

# Try the model without using cross val predict.  Split into train and test like in part I to get the model right... then figure out cross val predict.

In [76]:
# Connecting to SQLite Database
import pandas as pd
import sqlite3
conn = sqlite3.connect("lahman2016.sqlite")

In [77]:
queryMaster = 'select playerID,nameFirst,nameLast,bats,throws,debut,finalGame from Master;'
queryPitching = 'select * from Pitching;'
queryFielding = 'select * from Fielding;'
queryAwards = 'select playerID,awardID,yearID from AwardsPlayers;'
queryAllStar = 'select playerID, YearID from AllstarFull;'
queryHOF = 'select playerID,yearid from HallofFame where inducted == "Y" and category == "Player";'

In [78]:
batting_df = pd.read_csv('Batting.csv')

In [79]:
master = conn.execute(queryMaster).fetchall()
master_df = pd.DataFrame(master)
pitching = conn.execute(queryPitching).fetchall()
pitching_df = pd.DataFrame(pitching)
fielding = conn.execute(queryFielding).fetchall()
fielding_df = pd.DataFrame(fielding)
awards = conn.execute(queryAwards).fetchall()
awards_df = pd.DataFrame(awards)
allstar = conn.execute(queryAllStar).fetchall()
allstar_df = pd.DataFrame(allstar)
hof = conn.execute(queryHOF).fetchall()
hof_df = pd.DataFrame(hof)

In [80]:
master_cols = ['playerID','nameFirst','nameLast','bats','throws','debut','finalGame']
master_df.columns = master_cols

pitching_cols = ['playerID','yearID','stint','teamID','lgID','W','L','G','GS','CG','SHO',
'SV','IPOuts','H','ER','HR','BB','SO','BAOpp','ERA','IBB','WP','HBP','BK','BFP','GF','R','SH','SF','GIDP']
pitching_df.columns = pitching_cols

fielding_cols = ['playerID','yearID','stint','teamID','lgID','Pos','G','GS','InnOuts','PO','A','E','DP','PB','WP','SB','CS','ZR']
fielding_df.columns = fielding_cols

awards_cols = ['playerID','awardID','yearID']
awards_df.columns = awards_cols

allstar_cols = ['playerID','YearID']
allstar_df.columns = allstar_cols

hof_cols = ['playerID','yearid']
hof_df.columns = hof_cols

In [81]:
print(batting_df.head())

    playerID  yearID  stint teamID lgID   G   AB   R   H  2B  ...    RBI   SB  \
0  abercda01    1871      1    TRO  NaN   1    4   0   0   0  ...    0.0  0.0   
1   addybo01    1871      1    RC1  NaN  25  118  30  32   6  ...   13.0  8.0   
2  allisar01    1871      1    CL1  NaN  29  137  28  40   4  ...   19.0  3.0   
3  allisdo01    1871      1    WS3  NaN  27  133  28  44  10  ...   27.0  1.0   
4  ansonca01    1871      1    RC1  NaN  25  120  29  39  11  ...   16.0  6.0   

    CS  BB   SO  IBB  HBP  SH  SF  GIDP  
0  0.0   0  0.0  NaN  NaN NaN NaN   NaN  
1  1.0   4  0.0  NaN  NaN NaN NaN   NaN  
2  1.0   2  5.0  NaN  NaN NaN NaN   NaN  
3  1.0   0  2.0  NaN  NaN NaN NaN   NaN  
4  2.0   2  1.0  NaN  NaN NaN NaN   NaN  

[5 rows x 22 columns]


In [82]:
player_stats = {}
for i, row in batting_df.iterrows():
    playerID = row['playerID']
    G = row['G']
    AB = row['AB']
    R = row['R']
    H = row['H']
    twoB = row['2B']
    threeB = row['3B']
    HR = row['HR']
    RBI = row['RBI']
    SB = row['SB']
    CS = row['CS']
    BB = row['BB']
    SO = row['SO']
    IBB = row['IBB']
    HBP = row['HBP']
    SH = row['SH']
    SF = row['SF']
    GIDP = row['GIDP']
    if playerID in player_stats:
        player_stats[playerID]['G'] = player_stats[playerID]['G'] + G
        player_stats[playerID]['AB'] = player_stats[playerID]['AB'] + AB
        player_stats[playerID]['R'] = player_stats[playerID]['R'] + R
        player_stats[playerID]['H'] = player_stats[playerID]['H'] + H
        player_stats[playerID]['2B'] = player_stats[playerID]['2B'] + twoB
        player_stats[playerID]['3B'] = player_stats[playerID]['3B'] + threeB
        player_stats[playerID]['HR'] = player_stats[playerID]['HR'] + HR
        player_stats[playerID]['RBI'] = player_stats[playerID]['RBI'] + RBI
        player_stats[playerID]['SB'] = player_stats[playerID]['SB'] + SB
        player_stats[playerID]['CS'] = player_stats[playerID]['CS'] + CS
        player_stats[playerID]['BB'] = player_stats[playerID]['BB'] + BB
        player_stats[playerID]['SO'] = player_stats[playerID]['SO'] + SO
        player_stats[playerID]['IBB'] = player_stats[playerID]['IBB'] + IBB
        player_stats[playerID]['HBP'] = player_stats[playerID]['HBP'] + HBP
        player_stats[playerID]['SH'] = player_stats[playerID]['SH'] + SH
        player_stats[playerID]['GIDP'] = player_stats[playerID]['GIDP'] + GIDP
    else:
        player_stats[playerID] = {}
        player_stats[playerID]['G'] = G
        player_stats[playerID]['AB'] = AB
        player_stats[playerID]['R'] = R
        player_stats[playerID]['H'] = H
        player_stats[playerID]['2B'] = twoB
        player_stats[playerID]['3B'] = threeB
        player_stats[playerID]['HR'] = HR
        player_stats[playerID]['RBI'] = RBI
        player_stats[playerID]['SB'] = SB
        player_stats[playerID]['CS'] = CS
        player_stats[playerID]['BB'] = BB
        player_stats[playerID]['SO'] = SO
        player_stats[playerID]['IBB'] = IBB
        player_stats[playerID]['HBP'] = HBP
        player_stats[playerID]['SH'] = SH
        player_stats[playerID]['GIDP'] = GIDP        


In [83]:
pitcher_list = []
for i, row in pitching_df.iterrows():
    playerID = row['playerID']
    Wp = row['W']
    Lp = row['L']
    Gp = row['G']
    GSp = row['GS']
    CGp = row['CG']
    SHOp = row['SHO']
    SVp = row['SV']
    IPOutsp = row['IPOuts']
    Hp = row['H']
    ERp = row['ER']
    HRp = row['HR']
    BBp = row['BB']
    SOp = row['SO']
    IBBp = row['IBB']
    WPp = row['WP']
    HBPp = row['HBP']
    BKp = row['BK']
    BFPp = row['BFP']
    GFp = row['GF']
    Rp = row['R']
    SHp = row['SH']
    SFp = row['SF']
    GIDPp = row['GIDP']
    if playerID in player_stats and playerID in pitcher_list:
        player_stats[playerID]['Wp'] = player_stats[playerID]['Wp'] + Wp
        player_stats[playerID]['Lp'] = player_stats[playerID]['Lp'] + Lp
        player_stats[playerID]['Gp'] = player_stats[playerID]['Gp'] + Gp
        player_stats[playerID]['GSp'] = player_stats[playerID]['GSp'] + GSp
        player_stats[playerID]['CGp'] = player_stats[playerID]['CGp'] + CGp
        player_stats[playerID]['SHOp'] = player_stats[playerID]['SHOp'] + SHOp
        player_stats[playerID]['SVp'] = player_stats[playerID]['SVp'] + SVp
        player_stats[playerID]['IPOutsp'] = player_stats[playerID]['IPOutsp'] + IPOutsp
        player_stats[playerID]['Hp'] = player_stats[playerID]['Hp'] + Hp
        player_stats[playerID]['ERp'] = player_stats[playerID]['ERp'] + ERp
        player_stats[playerID]['HRp'] = player_stats[playerID]['HRp'] + HRp
        player_stats[playerID]['BBp'] = player_stats[playerID]['BBp'] + BBp
        player_stats[playerID]['SOp'] = player_stats[playerID]['SOp'] + SOp
        player_stats[playerID]['IBBp'] = player_stats[playerID]['IBBp'] + IBBp
        player_stats[playerID]['WPp'] = player_stats[playerID]['WPp'] + WPp
        player_stats[playerID]['HBPp'] = player_stats[playerID]['HBPp'] + HBPp
        player_stats[playerID]['BKp'] = player_stats[playerID]['BKp'] + BKp
        player_stats[playerID]['BFPp'] = player_stats[playerID]['BFPp'] + BFPp
        player_stats[playerID]['GFp'] = player_stats[playerID]['GFp'] + GFp
        player_stats[playerID]['Rp'] = player_stats[playerID]['Rp'] + Rp
        player_stats[playerID]['SHp'] = player_stats[playerID]['SHp'] + SHp
        player_stats[playerID]['SFp'] = player_stats[playerID]['SFp'] + SFp
        player_stats[playerID]['GIDPp'] = player_stats[playerID]['GIDPp'] + GIDPp
    else:
        pitcher_list.append(playerID)
        player_stats[playerID]['Wp'] = Wp
        player_stats[playerID]['Lp'] = Lp
        player_stats[playerID]['Gp'] = Gp
        player_stats[playerID]['GSp'] = GSp
        player_stats[playerID]['CGp'] = CGp
        player_stats[playerID]['SHOp'] = SHOp
        player_stats[playerID]['SVp'] = SVp
        player_stats[playerID]['IPOutsp'] = IPOutsp
        player_stats[playerID]['Hp'] = Hp
        player_stats[playerID]['ERp'] = ERp
        player_stats[playerID]['HRp'] = HRp
        player_stats[playerID]['BBp'] = BBp
        player_stats[playerID]['SOp'] = SOp
        player_stats[playerID]['IBBp'] = IBBp
        player_stats[playerID]['WPp'] = WPp
        player_stats[playerID]['HBPp'] = HBPp
        player_stats[playerID]['BKp'] = BKp
        player_stats[playerID]['BFPp'] = BFPp
        player_stats[playerID]['GFp'] = GFp
        player_stats[playerID]['Rp'] = Rp
        player_stats[playerID]['SHp'] = SHp
        player_stats[playerID]['SFp'] = SFp
        player_stats[playerID]['GIDPp'] = GIDPp

        

In [84]:
fielder_list = []
for i, row in fielding_df.iterrows():
    playerID = row['playerID']
    Gf = row['G']
    GSf = row['GS']
    InnOutsf = row['InnOuts']
    POf = row['PO']
    Af = row['A']
    Ef = row['E']
    DPf = row['DP']
    PBfc = row['PB']
    WPfc = row['WP']
    SBfc = row['SB']
    CSfc = row['CS']
    if playerID in player_stats and playerID in fielder_list:
        player_stats[playerID]['Gf'] = player_stats[playerID]['Gf'] + Gf
        player_stats[playerID]['GSf'] = player_stats[playerID]['GSf'] + GSf
        player_stats[playerID]['InnOutsf'] = player_stats[playerID]['InnOutsf'] + InnOutsf
        player_stats[playerID]['POf'] = player_stats[playerID]['POf'] + POf
        player_stats[playerID]['Af'] = player_stats[playerID]['Af'] + Af
        player_stats[playerID]['Ef'] = player_stats[playerID]['Ef'] + Ef
        player_stats[playerID]['DPf'] = player_stats[playerID]['DPf'] + DPf
        player_stats[playerID]['PBfc'] = player_stats[playerID]['PBfc'] + PBfc
        player_stats[playerID]['WPfc'] = player_stats[playerID]['WPfc'] + WPfc
        player_stats[playerID]['SBfc'] = player_stats[playerID]['SBfc'] + SBfc
        player_stats[playerID]['CSfc'] = player_stats[playerID]['CSfc'] + CSfc
    else:
        fielder_list.append(playerID)
        player_stats[playerID]['Gf'] = Gf
        player_stats[playerID]['GSf'] = GSf
        player_stats[playerID]['InnOutsf'] = InnOutsf
        player_stats[playerID]['POf'] = POf
        player_stats[playerID]['Af'] = Af
        player_stats[playerID]['Ef'] = Ef
        player_stats[playerID]['DPf'] = DPf
        player_stats[playerID]['PBfc'] = PBfc
        player_stats[playerID]['WPfc'] = WPfc
        player_stats[playerID]['SBfc'] = SBfc
        player_stats[playerID]['CSfc'] = CSfc

In [85]:
print(awards_df['awardID'].unique())

['Pitching Triple Crown' 'Triple Crown' 'Baseball Magazine All-Star'
 'Most Valuable Player' 'TSN All-Star' 'TSN Guide MVP'
 'TSN Major League Player of the Year' 'TSN Pitcher of the Year'
 'TSN Player of the Year' 'Rookie of the Year' 'Babe Ruth Award'
 'Lou Gehrig Memorial Award' 'World Series MVP' 'Cy Young Award'
 'Gold Glove' 'TSN Fireman of the Year' 'All-Star Game MVP' 'Hutch Award'
 'Roberto Clemente Award' 'Rolaids Relief Man Award' 'NLCS MVP' 'ALCS MVP'
 'Silver Slugger' 'Branch Rickey Award' 'Hank Aaron Award'
 'TSN Reliever of the Year' 'Comeback Player of the Year'
 'Outstanding DH Award' 'Reliever of the Year Award']


In [86]:
mvp_df = awards_df[awards_df['awardID'] == 'Most Valuable Player']
roy_df = awards_df[awards_df['awardID'] == 'Rookie of the Year']
cy_df = awards_df[awards_df['awardID'] == 'Cy Young Award']
gg_df = awards_df[awards_df['awardID'] == 'Gold Glove']
ss_df = awards_df[awards_df['awardID'] == 'Silver Slugger']

awards_list = [mvp_df,roy_df,cy_df,gg_df,ss_df]

In [87]:
print(awards_list[1].head())

       playerID             awardID  yearID
1796  robinja02  Rookie of the Year    1947
1845   darkal01  Rookie of the Year    1948
1899  sievero01  Rookie of the Year    1949
1900  newcodo01  Rookie of the Year    1949
1953  dropowa01  Rookie of the Year    1950


In [88]:
mvp_list = []
roy_list = []
cy_list = []
gg_list = []
ss_list = []
lists = [mvp_list,roy_list,cy_list,gg_list,ss_list]

In [89]:
for index, v in enumerate(awards_list):
    for i, row in v.iterrows():
        playerID = row['playerID']
        award = row['awardID']
        if playerID in player_stats and playerID in lists[index]:
            player_stats[playerID][award] += 1
        else:
            lists[index].append(playerID)
            player_stats[playerID][award] = 1

In [90]:
allstar_list = []
for i, row in allstar_df.iterrows():
    playerID = row['playerID']
    if playerID in player_stats and playerID in allstar_list:
        player_stats[playerID]['AS_games'] += 1
    else:
        allstar_list.append(playerID)
        player_stats[playerID]['AS_games'] = 1

In [91]:
for i, row in hof_df.iterrows():
    playerID = row['playerID']
    if playerID in player_stats:
        player_stats[playerID]['HoF'] = 1

In [92]:
print(len(hof_df))
print(len(master_df))

250
19105


In [93]:
print(dict(list(player_stats.items())[0:2]))

{'oconnbr01': {'RBI': 0.0, 'Ef': 0.0, '3B': 0, 'Gf': 6, 'AB': 2, 'HRp': 2, 'Wp': 0, 'BB': 0, 'IPOutsp': 37, '2B': 0, 'BKp': 0, 'GFp': 2.0, 'GSf': 1.0, 'InnOutsf': 37.0, 'SB': 0.0, 'SH': 0.0, 'SBfc': nan, 'SOp': 7, 'POf': 0, 'CSfc': nan, 'DPf': 0.0, 'HR': 0, 'CS': 0.0, 'Lp': 0, 'SVp': 0, 'SHOp': 0, 'SO': 1.0, 'WPp': 4.0, 'GSp': 1, 'Gp': 6, 'GIDP': 0.0, 'IBBp': 0.0, 'SHp': 1.0, 'HBPp': 1.0, 'Af': 0.0, 'SFp': 1.0, 'HBP': 0.0, 'BBp': 11, 'R': 0, 'Rp': 11, 'CGp': 0, 'H': 1, 'GIDPp': 0.0, 'IBB': 0.0, 'G': 6, 'Hp': 12, 'PBfc': nan, 'ERp': 7, 'BFPp': 62.0, 'WPfc': nan}, 'otanewi01': {'RBI': 24.0, 'SO': 48.0, 'Ef': 6.0, '3B': 0, 'Gf': 66, 'GSf': 51.0, 'AB': 212, '2B': 11, 'PBfc': nan, 'GIDP': 6.0, 'CSfc': nan, 'BB': 15, 'Af': 60.0, 'WPfc': nan, 'HBP': 2.0, 'G': 74, 'R': 28, 'H': 50, 'InnOutsf': 1383.0, 'SB': 0.0, 'SH': 1.0, 'SBfc': nan, 'IBB': 0.0, 'POf': 128, 'DPf': 19.0, 'HR': 7, 'CS': 0.0}}


In [94]:
stats_df = pd.DataFrame.from_dict(player_stats, orient='index')

In [95]:
stats_df['playerID'] = stats_df.index

In [96]:
print(stats_df.head())

              RBI     Ef  3B      Gf     AB   HRp    Wp    BB  IPOutsp   2B  \
aardsda01     0.0    3.0   0   331.0      4  41.0  16.0     0   1011.0    0   
aaronha01  2297.0  144.0  98  3020.0  12364   NaN   NaN  1402      NaN  624   
aaronto01    94.0   22.0   6   387.0    944   NaN   NaN    86      NaN   42   
aasedo01      0.0   13.0   0   448.0      5  89.0  66.0     0   3328.0    0   
abadan01      0.0    1.0   0     9.0     21   NaN   NaN     4      NaN    0   

             ...        BFPp  WPfc  HoF  Most Valuable Player  AS_games  \
aardsda01    ...      1475.0   NaN  NaN                   NaN       NaN   
aaronha01    ...         NaN   NaN  1.0                   1.0      25.0   
aaronto01    ...         NaN   NaN  NaN                   NaN       NaN   
aasedo01     ...      4730.0   NaN  NaN                   NaN       1.0   
abadan01     ...         NaN   NaN  NaN                   NaN       NaN   

           Rookie of the Year  Gold Glove  Silver Slugger  Cy Young Award 

In [97]:
print(master_df.head())

    playerID nameFirst nameLast bats throws       debut   finalGame
0  aardsda01     David  Aardsma    R      R  2004-04-06  2015-08-23
1  aaronha01      Hank    Aaron    R      R  1954-04-13  1976-10-03
2  aaronto01    Tommie    Aaron    R      R  1962-04-10  1971-09-26
3   aasedo01       Don     Aase    R      R  1977-07-26  1990-10-03
4   abadan01      Andy     Abad    L      L  2001-09-10  2006-04-13


In [98]:
df = master_df.join(stats_df,on='playerID',how='inner',rsuffix='mstr')

In [99]:
print(df.head())
print(len(stats_df))
print(len(df))
print(len(master_df))

    playerID nameFirst nameLast bats throws       debut   finalGame     RBI  \
0  aardsda01     David  Aardsma    R      R  2004-04-06  2015-08-23     0.0   
1  aaronha01      Hank    Aaron    R      R  1954-04-13  1976-10-03  2297.0   
2  aaronto01    Tommie    Aaron    R      R  1962-04-10  1971-09-26    94.0   
3   aasedo01       Don     Aase    R      R  1977-07-26  1990-10-03     0.0   
4   abadan01      Andy     Abad    L      L  2001-09-10  2006-04-13     0.0   

      Ef  3B      ...         BFPp  WPfc  HoF  Most Valuable Player  AS_games  \
0    3.0   0      ...       1475.0   NaN  NaN                   NaN       NaN   
1  144.0  98      ...          NaN   NaN  1.0                   1.0      25.0   
2   22.0   6      ...          NaN   NaN  NaN                   NaN       NaN   
3   13.0   0      ...       4730.0   NaN  NaN                   NaN       1.0   
4    1.0   0      ...          NaN   NaN  NaN                   NaN       NaN   

   Rookie of the Year  Gold Glove  Sil

In [100]:
def bats_throws(col):
    if col == "R":
        return 1
    else:
        return 0
        
df['bats_R'] = df['bats'].apply(bats_throws)
df['throws_R'] = df['throws'].apply(bats_throws)

In [101]:
pos_list = []
pos_dict = {}
for i, row in fielding_df.iterrows():
    playerID = row['playerID']
    games = row['G']
    pos = row['Pos']
    if playerID in pos_dict:
        if pos in pos_dict[playerID]:
            pos_dict[playerID][pos] = pos_dict[playerID][pos] + games
        else:
            pos_dict[playerID][pos] = games
    else:
        pos_dict[playerID] = {}
        pos_dict[playerID][pos] = games

In [102]:
print(len(player_stats))

18915


In [103]:
primary_pos_dict = {}
player_list = []
for k, v in pos_dict.items():
    playerID = k
    primary_pos_dict[playerID] = {}
    for key, val in v.items():
        primary_pos_dict[playerID]['game_count'] = 0
        if val > primary_pos_dict[playerID]['game_count']:
            primary_pos_dict[playerID]['pos'] = key
            primary_pos_dict[playerID]['game_count'] = val

In [104]:
print(len(primary_pos_dict))
print(primary_pos_dict)

18714
{'oconnbr01': {'game_count': 6, 'pos': 'P'}, 'otanewi01': {'game_count': 2, 'pos': 'OF'}, 'stratas01': {'game_count': 1, 'pos': 'SS'}, 'munozbo01': {'game_count': 100, 'pos': 'P'}, 'villajo01': {'game_count': 7, 'pos': 'OF'}, 'wrighwe01': {'game_count': 2, 'pos': 'OF'}, 'moragda01': {'game_count': 4, 'pos': 'P'}, 'beattbl01': {'game_count': 7, 'pos': 'P'}, 'solaito01': {'game_count': 1, 'pos': 'OF'}, 'barthjo01': {'game_count': 4, 'pos': 'P'}, 'lombaer01': {'game_count': 1544, 'pos': 'C'}, 'breweto02': {'game_count': 10, 'pos': 'OF'}, 'cornena01': {'game_count': 56, 'pos': 'P'}, 'casseja01': {'game_count': 15, 'pos': 'P'}, 'kellyji01': {'game_count': 190, 'pos': 'OF'}, 'arredjo01': {'game_count': 214, 'pos': 'P'}, 'walkemi02': {'game_count': 5, 'pos': 'P'}, 'lopezmi01': {'game_count': 3, 'pos': '2B'}, 'barclcu01': {'game_count': 44, 'pos': 'P'}, 'alexaga01': {'game_count': 19, 'pos': 'OF'}, 'greerke01': {'game_count': 9, 'pos': 'P'}, 'conneto01': {'game_count': 3, 'pos': 'OF'}, '

In [105]:
primary_pos_df = pd.DataFrame.from_dict(primary_pos_dict, orient='index')

In [106]:
primary_pos_df = primary_pos_df.drop('game_count', axis=1)

In [107]:
df = df.join(primary_pos_df,on='playerID',how='inner')

In [108]:
dummy_df = pd.get_dummies(df['pos'], prefix='pos_')
df = pd.concat([df, dummy_df], axis=1)

In [109]:
from datetime import datetime
df['debut'] =  pd.to_datetime(df['debut'])
df['finalGame'] = pd.to_datetime(df['finalGame'])

In [110]:
df['debutYear'] = pd.to_numeric(df['debut'].dt.strftime('%Y'), errors='coerce')
df['finalYear'] = pd.to_numeric(df['finalGame'].dt.strftime('%Y'), errors='coerce')

In [111]:
df['YSR'] = 2016 - df['finalYear']

In [112]:
df = df[df['YSR'] > 15]

In [113]:
df_hof = df[df['HoF'] == 1]
print(df_hof)
print(len(df_hof))

        playerID nameFirst     nameLast bats throws      debut  finalGame  \
1      aaronha01      Hank        Aaron    R      R 1954-04-13 1976-10-03   
177    alexape01      Pete    Alexander    R      R 1911-04-15 1930-05-28   
389    ansonca01       Cap        Anson    R      R 1871-05-06 1897-10-03   
398    aparilu01      Luis     Aparicio    R      R 1956-04-17 1973-09-28   
405    applilu01      Luke      Appling    R      R 1930-09-10 1950-10-01   
480    ashburi01    Richie      Ashburn    L      R 1948-04-20 1962-09-30   
541    averiea01      Earl      Averill    L      R 1929-04-16 1941-04-25   
642    bakerfr01  Home Run        Baker    L      R 1908-09-21 1922-09-29   
702    bancrda01      Dave     Bancroft    B      R 1915-04-14 1930-05-31   
714    bankser01     Ernie        Banks    R      R 1953-09-17 1971-09-26   
1028   becklja01      Jake      Beckley    L      L 1888-06-20 1907-06-15   
1133   benchjo01    Johnny        Bench    R      R 1967-08-28 1983-09-29   

In [114]:
#Creating HoF Eligibility Column
def eligibility(y):
    if y <= 15:
        return 1
    else:
        return 0
    
df['stillEligible'] = df['YSR'].apply(eligibility)

In [115]:
print(df['stillEligible'].head())

1    0
2    0
3    0
6    0
7    0
Name: stillEligible, dtype: int64


In [116]:
df['AVE'] = df['H'] / df['AB']

In [117]:
df['ERA'] = df['ERp'] / (df['IPOutsp'] / 27)

In [118]:
df_hitters = df[df['pos__P'] == 0]

In [119]:
print(df['ERA'].head())

1         NaN
2         NaN
3    3.796875
6         NaN
7         NaN
Name: ERA, dtype: float64


In [120]:
print(df.columns)

Index(['playerID', 'nameFirst', 'nameLast', 'bats', 'throws', 'debut',
       'finalGame', 'RBI', 'Ef', '3B', 'Gf', 'AB', 'HRp', 'Wp', 'BB',
       'IPOutsp', '2B', 'BKp', 'GFp', 'GSf', 'InnOutsf', 'SB', 'SH', 'SBfc',
       'SOp', 'POf', 'CSfc', 'DPf', 'HR', 'CS', 'Lp', 'SVp', 'SHOp', 'SO',
       'WPp', 'GSp', 'Gp', 'GIDP', 'IBBp', 'SHp', 'HBPp', 'Af', 'SFp', 'HBP',
       'BBp', 'R', 'Rp', 'CGp', 'H', 'GIDPp', 'IBB', 'G', 'Hp', 'PBfc', 'ERp',
       'BFPp', 'WPfc', 'HoF', 'Most Valuable Player', 'AS_games',
       'Rookie of the Year', 'Gold Glove', 'Silver Slugger', 'Cy Young Award',
       'playerIDmstr', 'bats_R', 'throws_R', 'pos', 'pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__P', 'pos__SS', 'debutYear',
       'finalYear', 'YSR', 'stillEligible', 'AVE', 'ERA'],
      dtype='object')


In [121]:
numeric_cols = ['Rp', 'SVp', 'RBI', 'BFPp', 'HRp', 'SHOp', 'CSfc', 'SB', 'GSp', 'CS', 'Ef', 'R', 'IPOutsp',
       'PBfc', 'BB', 'SBfc', 'Af', '2B', '3B', 'SO',
       'HR', 'CGp', 'WPfc', 'Lp', 'G', 'IBB',
       'H', 'SOp', 'AB', 'ERp', 'Wp', 'AS_games', 'HoF', 'Gold Glove', 'Rookie of the Year',
       'Silver Slugger', 'Most Valuable Player', 'Cy Young Award', 'bats_R',
       'throws_R','pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__P', 'pos__SS', 'YSR', 'AVE']
data = df[numeric_cols]

In [122]:
numeric_cols1 = ['Rp', 'SVp', 'GIDP', 'RBI', 'IBBp', 'BFPp', 'InnOutsf',
       'HRp', 'SHOp', 'CSfc', 'SB', 'GSp', 'CS', 'Ef', 'R', 'BKp', 'IPOutsp',
       'WPp', 'PBfc', 'BB', 'SBfc', 'Af', 'HBPp', '2B', '3B', 'SFp', 'SO',
       'HR', 'DPf', 'POf', 'Gf', 'CGp', 'WPfc', 'Lp', 'Gp', 'SH', 'G', 'IBB',
       'H', 'SOp', 'AB', 'ERp', 'Hp', 'GSf', 'SHp', 'HBP', 'BBp', 'GIDPp',
       'Wp', 'GFp', 'AS_games', 'HoF', 'Gold Glove', 'Rookie of the Year',
       'Silver Slugger', 'Most Valuable Player', 'Cy Young Award', 'bats_R',
       'throws_R','pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__P', 'pos__SS', 'YSR', 'AVE']

In [124]:
num_cols_hitters = ['SB','CS','R','BB','SBfc', 'Af', '2B', '3B', 'SO','HR',
       'WPfc','G', 'IBB', 'H', 'AB', 'AS_games','Gold Glove', 'Rookie of the Year', 
       'Silver Slugger', 'Most Valuable Player','bats_R',
       'throws_R','pos__1B', 'pos__2B',
       'pos__3B', 'pos__C', 'pos__OF', 'pos__SS', 'YSR', 'AVE', 'HoF']
data = df_hitters[num_cols_hitters]

In [125]:
data = data.fillna(0)

In [148]:
data_hof = data[data['HoF'] == 1]

In [149]:
print(data_hof.head())
print(len(data_hof))

        SB     CS     R    BB  SBfc      Af   2B   3B      SO   HR ...   \
1    240.0   73.0  2174  1402   0.0   429.0  624   98  1383.0  755 ...    
177    3.0    0.0   154    77   0.0  1419.0   60   13   276.0   11 ...    
398  506.0  136.0  1335   736   0.0  8016.0  394   92   742.0   83 ...    
405  179.0  108.0  1319  1302   0.0  7543.0  440  102   528.0   45 ...    
480  234.0    0.0  1322  1198   0.0   182.0  317  109   571.0   29 ...    

     throws_R  pos__1B  pos__2B  pos__3B  pos__C  pos__OF  pos__SS   YSR  \
1           1      0.0      0.0      0.0     0.0      1.0      0.0  40.0   
177         1      0.0      0.0      0.0     0.0      1.0      0.0  86.0   
398         1      0.0      0.0      0.0     0.0      0.0      1.0  43.0   
405         1      1.0      0.0      0.0     0.0      0.0      0.0  66.0   
480         1      0.0      0.0      0.0     0.0      1.0      0.0  54.0   

          AVE  HoF  
1    0.304998  1.0  
177  0.208840  1.0  
398  0.261681  1.0  
405  0.3

In [143]:
# Split data DataFrame into train and test sets
train = data.sample(frac=0.60, random_state=1)
test = data.loc[~data.index.isin(train.index)]

In [128]:
target = data['HoF']
features = data.drop('HoF', axis=1)

In [144]:
train_target_t = train['HoF']
train_features_t = train.drop('HoF', axis=1)
test_target_t = test['HoF']
test_features_t = test.drop('HoF', axis=1)

## Don't forget to deal with class imbalance.

In [130]:
from sklearn.cross_validation import cross_val_predict, KFold
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight='balanced')
kf = KFold(features.shape[0], random_state=1)

predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)

In [135]:
len(train_target_t)

6230

In [145]:
lr.fit(train_features_t, train_target_t)
test_predictions = lr.predict(test_features_t)

In [146]:
test_predictions = pd.Series(test_predictions)

tn_filter = (test_predictions == 0) & (test_target_t == 0)
tn = len(test_predictions[tn_filter])

tp_filter = (test_predictions == 1) & (test_target_t == 1)
tp = len(test_predictions[tp_filter])

fn_filter = (test_predictions == 0) & (test_target_t == 1)
fn = len(test_predictions[fn_filter])

fp_filter = (test_predictions == 1) & (test_target_t == 0)
fp = len(test_predictions[fp_filter])

In [133]:
tn_filter = (predictions == 0) & (data['HoF'] == 0)
tn = len(predictions[tn_filter])

tp_filter = (predictions == 1) & (data['HoF'] == 1)
tp = len(predictions[tp_filter])

fn_filter = (predictions == 0) & (data['HoF'] == 1)
fn = len(predictions[fn_filter])

fp_filter = (predictions == 1) & (data['HoF'] == 0)
fp = len(predictions[fp_filter])

In [147]:
print(tn)
print(tp)
print(fn)
print(fp)
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)
print(fpr)
print(tpr)

519
2
14
44
0.07815275310834814
0.125


In [None]:
print(len(target))
print(len(df))
print(len(predictions))

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=1, class_weight='balanced')
predictions = cross_val_predict(rf, features, target, cv=kf)
predictions= pd.Series(predictions)

In [None]:
tn_filter = (predictions == 0) & (data['HoF'] == 0)
tn = len(predictions[tn_filter])

tp_filter = (predictions == 1) & (data['HoF'] == 1)
tp = len(predictions[tp_filter])

fn_filter = (predictions == 0) & (data['HoF'] == 1)
fn = len(predictions[fn_filter])

fp_filter = (predictions == 1) & (data['HoF'] == 0)
fp = len(predictions[fp_filter])

In [None]:
print(tn)
print(tp)
print(fn)
print(fp)

In [None]:
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)

In [None]:
print(fpr)
print(tpr)

In [None]:
print(predictions)