# Building the Pitcher Class

In [1]:
#Import necessary packages
import psycopg2
import sys  
sys.path.append('..')

import numpy as np
import pandas as pd

from config import REDSHIFT_CONFIG
from src.features import *
from src.utils import *
from src.validation import *

# Python 2 only: the interpreter's startup removes sys.setdefaultencoding, so the
# module is reloaded to get the function back, then the process-wide default
# str<->unicode codec is switched to UTF-8.
# NOTE(review): this is a global hack and does not run on Python 3 (no builtin
# `reload`, no `setdefaultencoding`) -- confirm it is still required.
reload(sys)
sys.setdefaultencoding('utf8')

from src.exploration import *

In [2]:
# Establish a connection to the redshift database using the settings in
# REDSHIFT_CONFIG, and open one cursor (`cur`) that the cells below share
# (it is passed into every Pitcher and used for ad-hoc queries).
conn = create_rs_conn(config=REDSHIFT_CONFIG)
cur = conn.cursor()

In [3]:
from datetime import date

class Pitcher:
    '''Master class to contain all info/methods related to a pitcher's pitch data.

    Expected call order (later steps read attributes set by earlier ones):
        1. subset_data_by_date / subset_data_by_columns   (optional)
        2. split_test_train   -> sets modeling_dict, baseline_accuracy
        3. run_classifiers    -> sets classifiers, best_acc, acc_over_most_common
        4. predict

    Most methods append a tag to self.method_history for record-keeping.
    '''

    def __init__(self, pitcher_id, redshift_cursor, table = None):
        '''Stores:
            -a database cursor (self.cur)
            -pitcher ID (self.pitcher_id)
            -all pitcher data (self.data) from 'table' (when 'table' is None the
             loader's own default is used; all_pitch_data per the original note)
            -the pitcher's name (self.name)
            -an empty call log (self.method_history)

        Raises IndexError if all_pitch_data has no row for pitcher_id.
        '''

        #Store the pitcher's id
        self.pitcher_id = pitcher_id

        #Using a passed-in cursor instead of creating it here, so we don't have to open new connection
        #for each pitcher we create
        self.cur = redshift_cursor

        #Use exception handling in case we get a shutdown of the connection.
        #`except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        try:
            self.data = self._load_data(table)
        except Exception:
            # Re-establish a connection to the redshift database and retry once.
            # Only self.cur is replaced; the cursor object the caller passed in is untouched.
            conn = create_rs_conn(config=REDSHIFT_CONFIG)
            self.cur = conn.cursor()
            self.data = self._load_data(table)

        #Get the pitcher's name.
        #%d forces an integer into the SQL text, so no quoting/injection issue arises here.
        #NOTE(review): the name always comes from all_pitch_data, even when 'table' is given.
        self.cur.execute('''select p_first_name, p_last_name
                    from all_pitch_data
                    where pitcher = %d
                    limit 1''' % self.pitcher_id)
        name_rows = self.cur.fetchall()
        if not name_rows:
            raise IndexError('no rows in all_pitch_data for pitcher %s' % (self.pitcher_id,))
        self.name = " ".join(name_rows[0])

        #initiate a list that keeps track of methods called on the object (record-keeping)
        self.method_history = []

    def _load_data(self, table):
        '''Fetch this pitcher's modeling dataframe through self.cur.

        'table' is forwarded only when given, so the loader's default applies otherwise.
        '''
        loader_kwargs = {'pitcher_id': self.pitcher_id, 'date_subsetting': False}
        if table is not None:
            loader_kwargs['table'] = table
        return get_pitcher_df_for_modeling(self.cur, **loader_kwargs)

    def __getstate__(self):
        '''Return the state to pickle: every attribute except the live DB cursor.

        The cursor is replaced by None because it is tied to an open connection
        and is of no use (and typically not picklable) once serialized.
        '''
        state = self.__dict__.copy()
        state['cur'] = None
        return state

    def find_optimal_date_splits(self):
        '''Placeholder: no split search is implemented yet; only logs a tag.'''
        #NOTE(review): the tag below matches split_test_train's prefix, not this
        #method's name -- looks like a copy/paste leftover; confirm before relying on it.
        self.method_history.append("split_test_train")

    def subset_data_by_date(self, max_date = None, min_date = '2008-01-01'):
        '''Subsets self.data to rows with min_date <= date <= max_date (inclusive).

        max_date defaults to today's date formatted as YYYY-MM-DD. Both bounds are
        stored on the object as self.max_date / self.min_date.
        '''

        #Get a max_date if one not given
        if max_date is None:
            max_date = date.today().strftime('%Y-%m-%d')

        #Store both bounds
        self.max_date = max_date
        self.min_date = min_date

        #Subset the data
        in_range = (self.data['date'] <= self.max_date) & (self.data['date'] >= self.min_date)
        self.data = self.data[in_range]

        #Indicate that the method has been called
        self.method_history.append('subset_by_date')

    def subset_data_by_columns(self, cols):
        '''self.data is subset to only include "cols"'''
        self.data = self.data[cols]
        self.method_history.append("subset_by_columns")

    def split_test_train(self, quantile_split = 0.9, date_override = None):
        '''Splits self.data into testing and train data, creating a new dictionary
        (self.modeling_dict) containing all the test/train data and targets, and
        stores the naive baseline accuracy in self.baseline_accuracy.

        The split date is date_override when given, otherwise the quantile_split
        quantile of the 'date' column.
        '''

        #Get the date on which to split test/train
        if date_override is None:
            #Keep the first 10 characters of the quantile's string form
            #(presumably the YYYY-MM-DD part of a timestamp -- confirm against the column dtype)
            split_date = str(self.data['date'].quantile(quantile_split))[:10]
        else:
            split_date = date_override

        #split the data and return a dictionary with test/train data/targets.
        #Inside a method the bare name resolves to the module-level split_test_train
        #helper, not to this method.
        self.modeling_dict = split_test_train(self.data, split_date)

        #Log the transaction
        self.method_history.append("split_test_train_%s" % (split_date,))

        #Store the baseline accuracy
        self.baseline_accuracy = naive_accuracy(self.modeling_dict)

    def pitch_type_by_year(self):
        '''returns a pandas dataframe getting the count of the pitch types by year'''
        return get_pitch_types_by_year(self.data, use_gameday = False)

    def run_classifiers(self):
        '''Runs the available classifiers and keeps the best-scoring combination.

        Requires split_test_train to have been called first (reads
        self.modeling_dict and self.baseline_accuracy). Sets self.classifiers,
        self.best_acc and self.acc_over_most_common, and prints a short summary.
        '''

        classifier_dict = run_all_classifiers(self.modeling_dict)
        all_predictions_dict = collect_classifier_predictions2(self.modeling_dict, classifier_dict)
        best_classifiers = choose_best_ensemble(all_predictions_dict, self.modeling_dict)

        #The winning combination is either one classifier name or several names;
        #normalise the single-name case so both are handled the same way
        chosen = best_classifiers['classifier_combination']
        if isinstance(chosen, str):
            chosen = [chosen]
        self.classifiers = {name: classifier_dict[name] for name in chosen}

        self.best_acc = best_classifiers['best_acc']
        self.acc_over_most_common = self.best_acc - self.baseline_accuracy

        #Single-argument print() gives identical output on Python 2 and parses on Python 3
        print('classifiers used: %s' % (list(self.classifiers),))
        print('best accuracy: %s' % (self.best_acc,))
        print('Accuracy above guessing most common: %s' % (self.acc_over_most_common,))

        #Log it
        self.method_history.append('ran_classifiers')

    def predict(self, new_data):
        '''Predicts on new_data with every classifier kept by run_classifiers and
        returns the result of ensemble_voting over those predictions.'''

        #One prediction set per retained classifier, keyed by classifier name
        #(the original read the non-existent attribute self.classifier here)
        pred_dict = {}
        for name, fitted in self.classifiers.items():
            pred_dict[name] = fitted.predict(new_data)

        #Vote based on the predictions
        return ensemble_voting(pred_dict)

    def delete_all_data(self):
        '''Delete all the data (before serializing the object).

        Removes self.data and self.modeling_dict when present; a missing attribute
        (e.g. split_test_train was never called) is skipped instead of raising.
        '''

        for attr_name in ('data', 'modeling_dict'):
            if hasattr(self, attr_name):
                delattr(self, attr_name)
        self.method_history.append('deleted all data')

In [259]:
# Build a Pitcher for id 448802, reusing the shared cursor opened above
test = Pitcher(pitcher_id = 448802, redshift_cursor = cur)

In [260]:
# Preview the first rows of the dataframe loaded for this pitcher
test.data.head()

Unnamed: 0,b,s,on_1b,on_2b,on_3b,pitch_type,o,stand,p_throws,home_wins,...,season_pitch_count,prev_pitches_mean_start_speed,prev_pitches_mean_end_speed,prev_pitches_mean_break_y,prev_pitches_mean_break_angle,prev_pitches_mean_break_length,last_pitch_type,second_last_pitch_type,third_last_pitch_type,cur_season
33469,0,1,1,0,0,Not_Fastball,0,R,L,43,...,4,90.1,83.6,23.8,-14.5,4.9,Fastball,Not_Fastball,Not_Fastball,0
33477,0,2,1,0,0,Not_Fastball,0,R,L,43,...,5,83.15,76.75,23.8,-7.25,10.05,Not_Fastball,Fastball,Not_Fastball,0
32475,1,2,1,0,0,Fastball,0,R,L,43,...,6,80.566667,74.033333,23.766667,-4.033333,11.7,Not_Fastball,Not_Fastball,Fastball,0
32497,2,2,1,0,0,Fastball,0,R,L,43,...,7,79.666667,73.2,23.766667,0.466667,12.0,Fastball,Not_Fastball,Not_Fastball,0
33470,2,3,1,0,0,Not_Fastball,0,R,L,43,...,8,83.433333,76.766667,23.766667,-0.4,8.866667,Fastball,Fastball,Not_Fastball,0


In [261]:
# Row count before any date subsetting
len(test.data)

10048

In [262]:
# The call log starts out empty
test.method_history

[]

In [263]:
# Keep only rows dated on or before 2014-01-01 (min_date stays at its default)
test.subset_data_by_date(max_date = '2014-01-01')

In [264]:
# Row count after the date subset
len(test.data)

8775

In [265]:
# The subset call should now be recorded in the log
test.method_history

['subset_by_date']

In [266]:
# First and last name fetched from all_pitch_data in __init__
test.name

'Jaime Garcia'

In [267]:
# Pitch-type counts per year for the (date-subset) data
test.pitch_type_by_year()

pitch_type,Fastball,Not_Fastball
2008,186,70
2010,1450,1139
2011,1860,1508
2012,1069,739
2013,378,376


In [268]:
# Split train/test at the 0.8 quantile of the 'date' column
test.split_test_train(quantile_split = 0.8)

In [269]:
# Preview the training rows produced by the split
test.modeling_dict['train_data'].head()

Unnamed: 0,b,s,on_1b,on_2b,on_3b,o,home_wins,home_loss,away_wins,away_loss,...,stadium_name_Wrigley Field,last_pitch_type_Fastball,last_pitch_type_Not_Fastball,last_pitch_type_not_available,second_last_pitch_type_Fastball,second_last_pitch_type_Not_Fastball,second_last_pitch_type_not_available,third_last_pitch_type_Fastball,third_last_pitch_type_Not_Fastball,third_last_pitch_type_not_available
33469,0,1,1,0,0,0,43,49,52,42,...,0,1,0,0,0,1,0,0,1,0
33477,0,2,1,0,0,0,43,49,52,42,...,0,0,1,0,1,0,0,0,1,0
32475,1,2,1,0,0,0,43,49,52,42,...,0,0,1,0,0,1,0,1,0,0
32497,2,2,1,0,0,0,43,49,52,42,...,0,1,0,0,0,1,0,0,1,0
33470,2,3,1,0,0,0,43,49,52,42,...,0,1,0,0,1,0,0,0,1,0


In [270]:
# Size of the test partition for the quantile split
len(test.modeling_dict['test_data'])

1834

In [271]:
# Size of the train partition for the quantile split
len(test.modeling_dict['train_data'])

6941

In [272]:
# Re-split using an explicit cut-off date instead of a quantile (overwrites modeling_dict)
test.split_test_train(date_override='2013-01-01')

In [273]:
# Size of the test partition for the date-override split
len(test.modeling_dict['test_data'])

754

In [274]:
# Size of the train partition for the date-override split
len(test.modeling_dict['train_data'])

8021

In [275]:
# Baseline stored by split_test_train (result of naive_accuracy on the split)
test.baseline_accuracy

0.501

In [276]:
# Fit the classifiers on the current split and keep the best combination
test.run_classifiers()

In [191]:
#Seeing how many active pitchers we'd get for these thresholds
#(active = at least one pitch dated after 2015-01-01)
for min_pitch_count in range(600, 3500, 100):
    #%d forces an integer into the SQL text, so the formatting is safe here
    cur.execute('''SELECT pitcher, 
                                COUNT(*) as tot_pitch_count, 
                                MAX(date) as maximum_date
                        FROM all_pitch_data_reclass
                        GROUP BY pitcher
                        HAVING count(*) >= %d AND 
                                MAX(date) > '2015-01-01'
                        ORDER BY pitcher''' % min_pitch_count)

    rows = cur.fetchall()
    header = [column[0] for column in cur.description]

    #Pass the header to the constructor: assigning .columns afterwards raises a
    #length-mismatch ValueError when the query returns no rows
    pitcher_df = pd.DataFrame(rows, columns=header)

    #Single-argument print() gives identical output on Python 2 and parses on Python 3
    print('Number of pitchers for pitch threshold of %d : %d' % (min_pitch_count, len(pitcher_df)))

In [None]:
# Draw a reproducible random sample of pitchers (seed 35) through the shared cursor.
# NOTE(review): the positional 5 and 3000 are presumably the sample size and a minimum
# pitch count -- confirm against randomly_sample_pitchers2's signature.
pitchers = randomly_sample_pitchers2(cur, 5, 3000, seed_num = 35)

In [None]:
# Pull the sampled pitcher ids out of the 'pitcher' column as an array to loop over
pitcher_list = pitchers['pitcher'].values

In [26]:
# Fit classifiers for every sampled pitcher and record how much they beat the
# naive baseline. Failures are reported and collected in `errors` so one bad
# pitcher does not stop the run.
accuracies = {}
errors = []  # was misspelled `erros`, so errors.append raised NameError in the handler
for pitcher in pitcher_list:

    try:
        print('starting pitcher: %s' % (pitcher,))
        test = Pitcher(pitcher_id = pitcher, redshift_cursor = cur)
        # Optional: restrict history first with
        #   test.subset_data_by_date(max_date = '2014-01-01')
        print(test.pitch_type_by_year())
        test.split_test_train()
        print('baseline acc: %s' % (test.baseline_accuracy,))
        test.run_classifiers()

        # Build the entry in one go so a failure (e.g. division by zero when the
        # baseline is 1.0) never leaves a half-filled record behind
        accuracies[pitcher] = {
            'improvement': test.acc_over_most_common,
            # share of the remaining headroom (1 - baseline) that was gained
            'rel_improvement': test.acc_over_most_common / (1 - test.baseline_accuracy),
            'baseline': test.baseline_accuracy,
        }

    except Exception as exc:
        # `except Exception` keeps Ctrl-C working; include the cause in the report
        print('something went wrong with %s: %r' % (pitcher, exc))
        errors.append(pitcher)

In [27]:
# Show the per-pitcher results collected by the loop above
accuracies

{112526: {'baseline': 0.835,
  'improvement': 0.0059090909090909749,
  'rel_improvement': 0.03581267217630893},
 136600: {'baseline': 0.599,
  'improvement': 0.018433414043583563,
  'rel_improvement': 0.045968613575021348},
 150274: {'baseline': 0.561,
  'improvement': 0.087089171974522261,
  'rel_improvement': 0.19838080176428763},
 150302: {'baseline': 0.552,
  'improvement': 0.16257905544147844,
  'rel_improvement': 0.36289967732472866},
 276351: {'baseline': 0.672,
  'improvement': 0.011042789223454808,
  'rel_improvement': 0.033667040315411005},
 285064: {'baseline': 0.517,
  'improvement': 0.10544124336618654,
  'rel_improvement': 0.21830485168982722},
 329092: {'baseline': 0.678,
  'improvement': 0.044955145118733442,
  'rel_improvement': 0.13961225192153245},
 400089: {'baseline': 0.701,
  'improvement': 0.005766917293233087,
  'rel_improvement': 0.019287348806799619},
 407793: {'baseline': 0.859,
  'improvement': 0.003322390984811352,
  'rel_improvement': 0.023563056629867744}

In [3]:
import pickle
def serialize_pitcher(pitcher_class, fp):
    '''Pickle `pitcher_class` into the file at path `fp`, overwriting any existing file.'''
    with open(fp, 'wb') as sink:
        pickle.dump(pitcher_class, sink)

In [4]:
def load_pitcher(fp):
    '''Return the object unpickled from the file at path `fp`.

    Unpickling can execute arbitrary code, so only load files you wrote yourself.
    '''
    with open(fp, 'rb') as source:
        return pickle.load(source)

## Getting 10 random pitchers' data for testing on plane

In [None]:
# Load and split the data for every sampled pitcher and keep the resulting
# Pitcher objects in pitcher_dict so they can be serialized for offline work.
# Classifiers are deliberately NOT run here (no test.run_classifiers(), so
# `accuracies` stays empty); only the data and the train/test split are stored.
accuracies = {}
errors = []
pitcher_dict = {}
for pitcher in pitcher_list:

    try:
        print('starting pitcher: %s' % (pitcher,))
        test = Pitcher(pitcher_id = pitcher, redshift_cursor = cur)
        # Optional: restrict history first with
        #   test.subset_data_by_date(max_date = '2014-01-01')
        print(test.pitch_type_by_year())
        test.split_test_train()
        print('baseline acc: %s' % (test.baseline_accuracy,))

        pitcher_dict[pitcher] = test

    except Exception as exc:
        # `except Exception` keeps Ctrl-C working; report the cause and remember
        # which pitchers failed (the original only had this as commented-out code)
        print('something went wrong with %s: %r' % (pitcher, exc))
        errors.append(pitcher)

### Serialize the randomly sampled pitcher dictionary

In [None]:
import pickle

# Write the sampled {pitcher id: Pitcher} mapping (pitcher_dict) to disk.
# The original cell used the undefined names `fp` and `pitcher_class`, which
# raise NameError at module level.
# NOTE(review): the path below is a placeholder -- point it at the intended location.
fp = 'pitcher_dict.pkl'
with open(fp, 'wb') as f:
    pickle.dump(pitcher_dict, f)

## Testing Model Calibration 

## Class Re-weighting