# Predicting Baseball Game Outcomes

### Load in packages

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections as c
%matplotlib inline
import itertools
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()

### Data Cleaning

In [2]:
# Make sure duplicates do not exist
with open('atbat_table.csv','r') as in_file, open('edited_atbat_table.csv','w') as out_file:
    seen = set()
    for line in in_file:
        if line in seen: continue 

        seen.add(line)
        out_file.write(line)

with open('pitch_table.csv','r') as in_file, open('edited_pitch_table.csv','w') as out_file:
    seen = set()
    for line in in_file:
        if line in seen: continue 

        seen.add(line)
        out_file.write(line)

In [45]:
# LOAD IN DATA FROM CSV
bat = pd.read_csv('edited_atbat_table.csv')
pitch = pd.read_csv('edited_pitch_table.csv')
bat.head(4)
pitch.head(4)

Unnamed: 0,retro_game_id,year,st_fl,regseason_fl,playoffs_fl,game_type,game_type_des,game_id,home_team_id,home_team_lg,...,break_y,break_angle,break_length,pitch_type,pitch_type_seq,type_conf,zone,spin_dir,spin_rate,sv_id
0,OAK201509010,2015,F,T,F,R,Regular Season,415615,oak,AL,...,23.8,1.8,4.4,FF,FF,0.909,5,184.549,1674.376,150901_190703
1,OAK201509010,2015,F,T,F,R,Regular Season,415615,oak,AL,...,23.9,16.4,4.3,FF,FF,0.909,13,199.334,1930.502,150901_190744
2,OAK201509010,2015,F,T,F,R,Regular Season,415615,oak,AL,...,23.8,-5.0,6.2,FC,FF|FC,0.761,1,168.632,961.722,150901_190756
3,OAK201509010,2015,F,T,F,R,Regular Season,415615,oak,AL,...,23.9,-9.5,6.2,FC,FF|FC|FC,0.659,14,158.747,1026.539,150901_190820


### Data Munging

In [46]:
# Create new bases column to signify number of bases attained during PA (similar to SLG)
def slugging(i):
    if i in ['Single']:
        return 1
    elif i == 'Double':
        return 2
    elif i == 'Triple':
        return 3    
    else:
        return 0            
bat['SLG'] = bat.event_tx.apply(slugging) 

# Create new on_base column to signify player's ability to get on base (similar to OBP)
nullplay = ['Hit By Pitch','Forceout','Sacrifice Bunt DP','Runner Out','Batter Interference','Catcher Interference','Sac Bunt','Fan interference']
out = ['Flyout','Lineout','Groundout','Sac Fly','Strikeout','Grounded Into DP','Fielders Choice Out','Pop Out','Bunt Groundout','Field Error','Triple Play','Bunt Pop Out','Double Play','Strikeout - DP','Fielders Choice','Sac Fly DP','Bunt Lineout']
on_base = ['Walk','Single','Double','Triple','Home Run','Intent Walk']
bat_perf = bat[~bat.event_tx.isin(nullplay)]

def hit_col(i):
    if i in on_base:
        return 1
    elif i in out:
        return 0
    else:
        print 'Missing Field:',i
            
bat_perf['OBP'] = bat_perf.event_tx.apply(hit_col) 

# Drop NA Rows in pitch data
pitch = pitch[pd.notnull(pitch['pitch_type'])]
# Drops rows w/ insignificant pitch types (pitch_out, eephus, intent, etc.)
bad_pitches = ['AB','EP','UN','IN']
pitch = pitch[~pitch.pitch_type.isin(bad_pitches)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [50]:
# Create aggrgated date column
bat_perf['date'] = pd.to_datetime((bat_perf.year*10000+bat_perf.month*100+bat_perf.day).apply(str),format='%Y%m%d')

In [59]:
batter_list = [112526,120074,121347,116338]
lineup = bat_perf[bat_perf['bat_mlbid'].isin(batter_list)]
lineup_performance = lineup.groupby('bat_mlbid')[['OBP','SLG']].mean()
lineup_performance.mean()

OBP    0.285834
SLG    0.192691
dtype: float64

In [60]:
pitch_id = 592533 
pitcher = bat_perf[bat_perf['pit_mlbid'] == pitch_id]
pitcher_performance = pitcher.groupby('pit_mlbid')[['OBP','SLG']].mean()
pitcher_performance.mean()

OBP    0.473684
SLG    0.403509
dtype: float64