Clean Up Batting Stats Table

In [1]:
import numpy as np
import pandas as pd

## Directory holding the baseball data tables (reused later when writing results):
df_dir = 'c:/users/roke/documents/udacity/udacity nanodegree/project 2/baseball data tables/'

## Load the raw batting statistics table.
filename = 'batting.csv'
df_batting = pd.read_csv(df_dir + filename)

## Modern baseball rules took effect with the 1901 season, so keep only the
## seasons after 1900 and renumber the surviving rows from zero.
df_bat_modern = df_batting[df_batting['yearID'] > 1900].reset_index(drop=True)

len(df_bat_modern)

91749

In [2]:
## The batting table has many NaN's in its early seasons; these appear to belong to
## statistics that were not yet being recorded.  The columns I intend to use (at-bats,
## hits, doubles, triples and home runs) appear to be recorded throughout.  Counting the
## NaN's in the 'AB' column nevertheless turns up, somewhat surprisingly, a fair number.
pd.isnull(df_bat_modern['AB']).sum()

5149

In [3]:
## To see where the gaps are, pull out the rows whose 'AB' is missing and list their yearID's
df_no_AB = df_bat_modern[df_bat_modern['AB'].isnull()]
df_no_AB['yearID'].unique()

array([1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983,
       1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
       1995, 1996, 1997, 1998, 1999], dtype=int64)

In [4]:
## The gaps sit in the middle of the table, where scanning only its beginning and end
## would never reveal them.  Good lesson there.
## Discard every row that has a NaN in any of the columns needed for the
## statistics calculated later on.
df_bat_modern = df_bat_modern.dropna(subset=['AB','H','2B','3B','HR'])
len(df_bat_modern)

86600

In [5]:
## Confirm that the remaining columns I plan to use (yearID, playerID, stint, teamID, lgID)
## hold no NaN's: the row count after dropping on them should equal the table length.
df_bat_modern.dropna(subset=['yearID','playerID','stint','teamID','lgID']).shape[0]


86600

In [6]:
## No further surprises (I think).
## The table looked as if it was already ordered by yearID, playerID, stint; re-sort it
## anyway in case the steps above disturbed that order, then renumber the rows from zero.
df_bat_modern = (df_bat_modern
                 .sort_values(by=['yearID','playerID','stint'])
                 .reset_index(drop=True))
df_bat_modern.tail(10)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
86590,youngch04,2014,1,NYN,NL,88,254,31,52,12,...,28,7,3,25,54,2,4,1,3,3
86591,youngch04,2014,2,NYA,AL,23,71,9,20,8,...,10,1,0,7,16,0,1,0,0,0
86592,youngde03,2014,1,BAL,AL,83,242,27,73,11,...,30,2,0,10,51,0,3,0,0,6
86593,younger03,2014,1,NYN,NL,100,280,48,64,10,...,17,30,6,24,60,1,5,5,2,2
86594,zeidjo01,2014,1,HOU,AL,23,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
86595,zieglbr01,2014,1,ARI,NL,68,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
86596,zimmejo02,2014,1,WAS,NL,32,55,3,10,1,...,1,0,0,2,21,0,0,9,1,0
86597,zimmery01,2014,1,WAS,NL,61,214,26,60,19,...,38,0,0,22,37,0,0,0,4,6
86598,zobribe01,2014,1,TBA,AL,146,570,83,155,34,...,52,10,5,75,84,4,1,2,6,8
86599,zuninmi01,2014,1,SEA,AL,131,438,51,87,20,...,60,0,3,17,158,1,17,0,4,12


In [7]:
## Which is how the default integer index should look for a table of length 86600.  Moving on

In [8]:

## In the batting table there are multiple player/year combinations where a player has multiple
## rows, each representing a separate "stint", typically with a different club.  These need to be
## combined before calculating yearly stats like batting averages.

def combine_rows(first,second):
    """Merge two stint rows of the batting table into a single row.

    Parameters
    ----------
    first : row (e.g. a pandas Series) that supports ``.copy()`` and label indexing.
        Provides every field of the result that is not explicitly combined.
    second : row with the same labels, whose counting statistics are added on.

    Returns
    -------
    A copy of ``first`` in which each counting statistic is the sum of the two
    rows.  ``teamID`` and ``lgID`` are taken from ``second`` when ``second``
    has strictly more games ('G') than ``first``; otherwise ``first``'s values
    are kept.  Neither input row is modified.

    A NaN in either row for a given statistic makes the combined value NaN.
    """
    ## Every counting statistic in the table.  'BB' (walks) belongs here too;
    ## leaving it out would keep only the first stint's walks in the result.
    counting_stats = ['G','AB','R','H','2B','3B','HR','RBI','SB','CS','BB',
                      'SO','IBB','HBP','SH','SF','GIDP']

    return_row = first.copy()
    for stat in counting_stats:
        return_row[stat] += second[stat]

    ## Compare against first['G'], which still holds the pre-merge game count
    ## because only the copy was updated above.
    if second['G'] > first['G']:
        return_row['teamID'] = second['teamID']
        return_row['lgID'] = second['lgID']
    return return_row

max_stints = df_bat_modern['stint'].max()

## Starting with the highest numbered stint in the table, add each such row to the row
## directly before it and delete the higher numbered row.  combine_rows() switches the
## teamID and lgID fields to the added row when that row has the higher number of games.
## This relies on the table being sorted by yearID, playerID, stint with a default
## integer index, so that idx-1 is the preceding row.
## This code is inefficient due to the use of iterrows(), but I'm not sure how else
## to do it right now.

for i in range (max_stints, 1, -1):
    for idx,row in df_bat_modern[df_bat_modern['stint']==i].iterrows():
        ## Only the lookup is guarded: a KeyError raised anywhere else (for example a
        ## missing column inside combine_rows) should surface rather than be swallowed.
        try:
            prior_row = df_bat_modern.loc[idx-1]
        except KeyError: ## means lower numbered row already dropped, do nothing
            continue
        ## The preceding row can belong to a different player or season when this
        ## player's lower numbered stint is not in the table; never merge across players.
        if prior_row['playerID'] != row['playerID'] or prior_row['yearID'] != row['yearID']:
            continue
        df_bat_modern.loc[idx-1] = combine_rows(prior_row,row)
        df_bat_modern.drop([idx],inplace=True)

print('done')


done


In [9]:
## With each player's stints for a year combined, remove the rows where AB = 0.  Such a
## row now means the player had no official at-bats for the entire year.
## Keeping these rows would cause a divide-by-zero when calculating batting average
## and slugging percentage, leaving NaN's in those cells.  Removing them should have
## no effect on the analyses I'm doing: a hitter with no official at-bats in a year
## would not be counted in overall league batting stats, and would not have an
## individual batting average or slugging percentage calculable in that year.
df_bat_modern = df_bat_modern.loc[df_bat_modern['AB'].ne(0)]
len(df_bat_modern)

72411

In [10]:
## The "stint" column has served its purpose, so remove it from the table.

df_bat_modern = df_bat_modern.drop('stint', axis=1)
df_bat_modern.head(2)


Unnamed: 0,playerID,yearID,teamID,lgID,G,AB,R,H,2B,3B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,anderjo01,1901,MLA,AL,138,576,90,190,46,7,...,99,35,,24,,,3,4,,
1,bakerbo01,1901,CLE,AL,2,7,0,1,0,0,...,1,0,,0,,,0,0,,


In [11]:
## Consolidating the stints is slow, so save the table as it stands now; the next
## step(s) can then start from this file and be less time-consuming.

df_bat_modern.to_csv(path_or_buf=df_dir + 'Batting_Modern_TL_Fixed.csv', index=False)
