# Testing whether subsetting by pitch `type_confidence` helps accuracy

In [1]:
#Import necessary packages
import psycopg2
import sys  
sys.path.append('..')

import numpy as np
import pandas as pd

from config import REDSHIFT_CONFIG
from src.features import *
from src.utils import *
from src.validation import *

# Python 2 idiom: make 'utf8' the interpreter-wide default string encoding.
# reload(sys) has to run first, because sys.setdefaultencoding is not
# reachable on the sys module as originally imported.
# NOTE(review): neither call exists in this form on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

from src.exploration import *

In [2]:
# Establish a connection to the redshift database using the settings in
# REDSHIFT_CONFIG, and open the cursor ("cur") that the cells below query through
conn = create_rs_conn(config=REDSHIFT_CONFIG)
cur = conn.cursor()

In [3]:
#Increase the number of columns displayed with Pandas to 500, so that wide
#data frames are shown in full
pd.set_option('display.max_columns', 500)

In [4]:
#Function to randomly sample pitchers and tell how many of their pitch type confidences
#were above a certain threshold
def randomly_sample_pitchers3(cursor, num_pitchers = 5, min_pitch_count = 600, min_date = '2015-01-01', seed_num = None):
    '''Randomly samples pitchers from the db represented by "cursor" and returns, for each sampled
    pitcher, a summary of how their pitches are distributed over "type_confidence" bins.
    Input:
        cursor: DB handle. The query is sent as cursor.execute(query, params) with "%s"
            placeholders (the psycopg2 parameter style -- presumably what create_rs_conn
            returns; confirm if another driver is used)
        num_pitchers: The number of pitchers whose data you want returned
        min_pitch_count: Minimum number of pitches a pitcher must have thrown in order to be considered in the
            random sampling
        min_date: Only pitchers whose most recent pitch date is strictly after this date
            ('YYYY-MM-DD') are considered in the random sampling
        seed_num: If you want to be able to replicate the results, set a seed
    Output: Pandas DF with one row per sampled pitcher containing the pitcher id, total pitch
        count, most recent pitch date and the number of pitches in each 0.1-wide
        type_confidence bin. Bins are half-open, [low, high), so a pitch sitting exactly on a
        bin edge is counted once; only the top bin also includes its upper edge (1.0).
    NOTE: "seed" and "sample" are not defined in this notebook; they are expected to be in scope
        through the star imports (presumably random.seed / random.sample -- verify). If so,
        asking for more pitchers than qualify raises ValueError.'''

    #Get all pitchers meeting the min pitches / recency criteria. The two criteria are passed
    #as query parameters rather than interpolated into the SQL text.
    get_pitchers_query = '''SELECT pitcher, 
                                COUNT(*) as tot_pitch_count, 
                                MAX(date) as maximum_date,
                                SUM(CASE WHEN type_confidence >= 0.9 AND type_confidence <= 1.0 THEN 1 ELSE 0 END) AS ninety_to_one,
                                SUM(CASE WHEN type_confidence >= 0.8 AND type_confidence < 0.9 THEN 1 ELSE 0 END) AS eight_to_nine,
                                SUM(CASE WHEN type_confidence >= 0.7 AND type_confidence < 0.8 THEN 1 ELSE 0 END) AS seven_to_eight,
                                SUM(CASE WHEN type_confidence >= 0.6 AND type_confidence < 0.7 THEN 1 ELSE 0 END) AS six_to_seven,
                                SUM(CASE WHEN type_confidence >= 0.5 AND type_confidence < 0.6 THEN 1 ELSE 0 END) AS five_to_six,
                                SUM(CASE WHEN type_confidence >= 0.4 AND type_confidence < 0.5 THEN 1 ELSE 0 END) AS four_to_five,
                                SUM(CASE WHEN type_confidence >= 0.3 AND type_confidence < 0.4 THEN 1 ELSE 0 END) AS three_to_four,
                                SUM(CASE WHEN type_confidence >= 0.2 AND type_confidence < 0.3 THEN 1 ELSE 0 END) AS two_to_three,
                                SUM(CASE WHEN type_confidence >= 0.1 AND type_confidence < 0.2 THEN 1 ELSE 0 END) AS one_to_two,
                                SUM(CASE WHEN type_confidence >= 0.0 AND type_confidence < 0.1 THEN 1 ELSE 0 END) AS zero_to_one
                        FROM all_pitch_data_reclass
                        GROUP BY pitcher
                        HAVING COUNT(*) >= %s AND 
                                MAX(date) > %s
                        ORDER BY pitcher'''
    cursor.execute(get_pitchers_query, (min_pitch_count, min_date))

    #Build the data frame with its header in one step so that an empty result set still
    #produces a (zero-row) frame with the right columns
    rows = cursor.fetchall()
    header = [col_info[0] for col_info in cursor.description]
    pitcher_df = pd.DataFrame(rows, columns = header)

    #Seed right before sampling so the same seed always picks the same pitchers
    if seed_num is not None:
        seed(seed_num)

    #Sample from a plain list: it picks the same ids as sampling the NumPy array, but
    #newer versions of random.sample only accept real sequences
    pitcher_id_sample = sample(pitcher_df['pitcher'].tolist(), num_pitchers)

    #Keep only the rows belonging to the sampled pitchers
    return pitcher_df[pitcher_df['pitcher'].isin(pitcher_id_sample)]

From these randomly sampled pitchers, it appears that some of them do have pitches with a low pitch type confidence. Let's write a function that keeps only the pitches whose `type_confidence` is above a chosen threshold (for example 0.6, i.e. the pitch type was assigned with at least 60% confidence) and see how the models perform compared to using all pitches. NOTE: Because `type_confidence` is associated with every pitch, we can leave out pitches below a certain threshold for both the testing and training data. If this were not the case, then we'd have to make sure we only left them out of the training data.

In [5]:
# Sample 10 pitchers with at least 2000 pitches each; the fixed seed keeps the
# sample reproducible across runs
test = randomly_sample_pitchers3(cur, num_pitchers = 10, min_pitch_count=2000, seed_num = 35)

In [6]:
test

Unnamed: 0,pitcher,tot_pitch_count,maximum_date,ninety_to_one,eight_to_nine,seven_to_eight,six_to_seven,five_to_six,four_to_five,three_to_four,two_to_three,one_to_two,zero_to_one
103,448178,5534,2015-07-21,776,279,62,38,22,5,0,0,0,67
108,448802,10166,2015-06-24,1362,722,87,54,42,8,0,0,0,34
207,476451,12380,2015-07-21,5320,4682,541,302,276,40,0,1,0,63
214,477569,4988,2015-07-21,801,352,90,46,21,7,0,0,0,77
283,518875,4450,2015-07-21,916,360,10,5,3,0,0,0,0,45
284,518886,5416,2015-07-20,914,296,10,4,4,0,0,0,0,20
286,519043,6601,2015-07-20,1039,540,166,99,65,11,0,0,0,24
327,543548,9212,2015-07-19,2255,1134,135,75,74,3,0,3,3,30
352,571901,3108,2015-07-18,1014,355,50,26,19,4,0,0,0,36
365,592767,6018,2015-05-05,1803,382,155,98,95,39,14,3,0,22


In [7]:
#Function to subset data retrieved from Redshift based on confidence
def get_pitcher_df_for_modeling2(cur, pitcher_id, binarize_pitches = True, exclude_cols = None, date_subsetting = True, table = None, conf_thresh = 0.0):
    """
    This function takes in a pitcher's ID and creates a data frame that is ready for modeling, keeping
    only pitches whose 'type_confidence' is above a threshold.  The features created with this function
    are determined by the 'make_features' function.

    Inputs:
        cur: Redshift db cursor
        pitcher_id: numeric pitcher id
        binarize_pitches: indicates whether or not the pitches should be split into Fastball/Offspeed or not
        exclude_cols: List of strings of any additional columns to exclude from the df that's returned
        date_subsetting: Boolean that determines whether or not to subset the data based on our
        data integrity issue with missing games
        table: Name of the table to query; 'all_pitch_data' is used when None
        conf_thresh: Only pitches with type_confidence strictly greater than this value are kept.
        Because the comparison is strict, the default of 0.0 drops pitches with a confidence of
        exactly 0 as well as pitches with no confidence recorded
    Returns: A Pandas dataframe containing only columns which are useful for modeling
    """

    #Get the pitchers info from redshift and store it.  Every pitch of every game the pitcher
    #appeared in is pulled, because the score columns are prepared on whole games below.
    source_table = 'all_pitch_data' if table is None else table
    raw_query = """SELECT * FROM %s \
        WHERE game_id IN \
        (SELECT DISTINCT game_id FROM %s \
        WHERE pitcher = %d)
        """ % (source_table, source_table, pitcher_id)
    sample_header, sample_rows = run_rs_query(cur, raw_query)
    pitch_df = pd.DataFrame(sample_rows, columns = sample_header)

    # Add the home and away score at the pitch level to set up score_diff
    pitch_df = prepare_score_diff_df(pitch_df)

    # Limit to only the pitcher in question.  Take a copy so that the column assignments
    # below write to this frame rather than to a slice of the full game data.
    pitch_df = pitch_df[pitch_df['pitcher'] == pitcher_id].copy()

    #Convert the date to a pandas datetime object.  The format has to be passed by keyword:
    #the second positional argument of pd.to_datetime is 'errors', not 'format'.
    pitch_df['date'] = pd.to_datetime(pitch_df['date'], format = '%Y-%m-%d')

    #Subset down to dates with correct data, if applicable
    if date_subsetting:
        #subset down after 2008 and before 2013 because of data integrity issues
        pitch_df = pitch_df[(pitch_df['date'] >= '2009-01-01') &
                            (pitch_df['date'] <= '2013-01-01')].copy()

    #Binarize pitch type, if applicable
    if binarize_pitches:
        pitch_df['pitch_type'] = np.where(pitch_df['pitch_type'].isin(['FA', 'FF', 'FT', 'FC', 'FS', 'SI', 'SF']), 
                                              'Fastball', 
                                              'Not_Fastball')

    #Count the pitches without metadata BEFORE thresholding: a missing confidence never
    #passes the '>' comparison, so counting afterwards would always report 0
    to_be_removed = int(pitch_df['type_confidence'].isnull().sum())

    #Subset down based on type_confidence
    pitch_df = pitch_df[pitch_df['type_confidence'] > conf_thresh]

    #Make all the features encapsulated in the 'make_features' function
    pitch_df = make_features(pitch_df)

    #Remove pitches not containing metadata and tell the user how many were removed
    #(the second filter only matters if 'make_features' hands back rows without a confidence)
    to_be_removed += int(pitch_df['type_confidence'].isnull().sum())
    pitch_df = pitch_df[pitch_df['type_confidence'].notnull()]
    print("%d rows didn't contain pitch metadata and were removed" % to_be_removed)

    #Binarize the on-base variables
    pitch_df = binarize_on_base(pitch_df)

    #Get rid of columns that aren't useful for modeling
    cols_to_exclude = [u'game_id', u'num', u'pitcher', u'batter',
                    u'des', u'id', u'type', u'x', u'y', u'sv_id',
                    u'start_speed', u'end_speed', u'sz_top', u'sz_bot',
                    u'pfx_x', u'pfx_z', u'px', u'pz', u'x0', u'y0', 
                    u'z0', u'vx0', u'vy0', u'vz0', u'ax', u'ay', u'az', 
                    u'break_y', u'break_angle', u'break_length',
                       u'spin_dir', u'spin_rate', u'zone',
                    u'half', u'inning', u'score', u'b_height', 
                    u'event', u'event2', u'event3', u'home_team_runs', 
                    u'away_team_runs', u'p_first_name', u'p_last_name', 
                    u'p_height', u'pitcher_dob', u'b_first_name', 
                    u'b_last_name', u'batter_dob', u'game_type', 
                    u'local_game_time', u'game_pk', u'game_time_et', 
                    u'home_id', u'home_fname', u'away_id', u'away_fname',
                    u'status_ind', u'day','home_score','away_score']
    pitch_df = pitch_df.drop(cols_to_exclude, axis = 1)

    #Check to see if the user has specified additional cols to drop
    if exclude_cols is not None:
        pitch_df = pitch_df.drop(exclude_cols, axis = 1)

    #Recategorize some variables that couldn't be calculated.  Assign the filled column back
    #instead of using chained indexing (df[col].loc[mask] = ...), which may modify a temporary
    #copy and leave the frame unchanged.
    for prev_pitch_col in ('last_pitch_type', 'second_last_pitch_type', 'third_last_pitch_type'):
        pitch_df[prev_pitch_col] = pitch_df[prev_pitch_col].fillna('not_available')

    #Convert 'season' to a categorical indicating current season (or not)
    if u'season' in pitch_df.columns:
        pitch_df['cur_season'] = np.where(pitch_df['season'] == pitch_df['season'].max(), 1, 0)
        pitch_df = pitch_df.drop('season', axis = 1)

    #Get rid of any rows that contain NAs
    num_of_na = pitch_df.isnull().any(axis = 1).sum()
    pitch_df = pitch_df.dropna()
    print("%d rows contained at least 1 NaN and were dropped" % num_of_na)

    return pitch_df

In [8]:
# Ids of the sampled pitchers, as a NumPy array, for the modeling loop below
pitcher_list = test['pitcher'].values

In [9]:
pitcher_list

array([448178, 448802, 476451, 477569, 518875, 518886, 519043, 543548,
       571901, 592767], dtype=int64)

In [10]:
# Feature columns handed to the models, assembled group by group.
# The concatenation order below is the column order of the final list.

# Count, base runners, outs, team records and batter handedness
_game_state_cols = [
    u'b', u's', u'on_1b', u'on_2b', u'on_3b', u'o',
    u'home_wins', u'home_loss', u'away_wins', u'away_loss',
    u'stand_L',
]

# "*_prior" columns, one group each for the pb/pbs, pc/pcs and pg/pgs suffixes
_pb_prior_cols = [
    u'Not_Fastball_pb_prior', u'Not_Fastball_pbs_prior',
    u'Fastball_pb_prior', u'Fastball_pbs_prior',
]
_pc_prior_cols = [
    u'Not_Fastball_pc_prior', u'Not_Fastball_pcs_prior',
    u'Fastball_pc_prior', u'Fastball_pcs_prior',
]
_pg_prior_cols = [
    u'Not_Fastball_pg_prior', u'Not_Fastball_pgs_prior',
    u'Fastball_pg_prior', u'Fastball_pgs_prior',
]

# Dummies for the last three pitch types, plus the mean measurements of previous pitches
_previous_pitch_cols = [
    u'last_pitch_type_Fastball',
    u'last_pitch_type_Not_Fastball',
    u'last_pitch_type_not_available',
    u'second_last_pitch_type_Fastball',
    u'second_last_pitch_type_Not_Fastball',
    u'second_last_pitch_type_not_available',
    u'third_last_pitch_type_Fastball',
    u'third_last_pitch_type_Not_Fastball',
    u'third_last_pitch_type_not_available',
    u'prev_pitches_mean_start_speed',
    u'prev_pitches_mean_end_speed',
    u'prev_pitches_mean_break_y',
    u'prev_pitches_mean_break_angle',
    u'prev_pitches_mean_break_length',
]

# Pitch counts and the current-season flag
_workload_cols = [u'ingame_pitch_count', u'cur_season', u'season_pitch_count']

cols_of_interest = (
    _game_state_cols
    + _pb_prior_cols
    + _pc_prior_cols
    + _pg_prior_cols
    + _previous_pitch_cols
    + _workload_cols
)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint as sp_randint
from sklearn.metrics import accuracy_score

In [13]:
# Accuracy results, keyed first by pitcher id and then by confidence threshold
results_dict = {}

# The same thresholds (0.0, 0.1, ..., 0.9) are evaluated for every pitcher
conf_levels = np.arange(0, 1, 0.1)

#Loop through all pitchers and get the different accuracy with different thresholds
for pitcher in pitcher_list:

    pitcher_results = results_dict[pitcher] = {}

    #Query the pitcher's data once; the thresholds are applied in memory below
    pitcher_df = get_pitcher_df_for_modeling2(
        cur,
        pitcher_id = pitcher,
        date_subsetting = False,
        table = 'all_pitch_data_reclass',
    )

    for conf in conf_levels:

        #Keep only the pitches at or above this confidence threshold
        confident_rows = pitcher_df['type_confidence'] >= conf
        new_pitch_df = pitcher_df[confident_rows]

        #Use the 90th percentile of the pitch dates ('YYYY-MM-DD') as the train/test cut-off
        date_cutoff = new_pitch_df['date'].quantile(.9)
        subset_date = str(date_cutoff)[:10]
        modeling_data = split_test_train(new_pitch_df, subset_date)

        #Restrict the split data to the modeling columns
        baseline_dict = subset_data(modeling_data, cols_of_interest)

        #Fit every classifier, collect their predictions and pick the most accurate combination
        classifier_dict = run_all_classifiers(baseline_dict)
        preds_dict = collect_classifier_predictions2(baseline_dict, classifier_dict)
        results = choose_best_ensemble(preds_dict, baseline_dict)

        #Record the naive accuracy, the best accuracy, their difference and the training size
        conf_results = pitcher_results[conf] = {}
        conf_results['naive_acc'] = naive_accuracy(baseline_dict)
        conf_results['best_acc'] = results['best_acc']
        conf_results['acc_diff'] = conf_results['best_acc'] - conf_results['naive_acc']
        conf_results['obs'] = len(baseline_dict['train_data'])

    #Largest gain over the naive accuracy at any threshold, and how much larger that is
    #than the gain at threshold 0.0
    pitcher_results['best_improve'] = max(pitcher_results[level]['acc_diff'] for level in conf_levels)
    pitcher_results['improve_from_base'] = pitcher_results['best_improve'] - pitcher_results[0.]['acc_diff']

In [15]:
for pitcher in pitcher_list:
    print "For pitcher:", pitcher
    print 'best_improve_conf:', results_dict[pitcher]['best_improve']
    print 'nominal improvement:', results_dict[pitcher]['improve_from_base'], '\n'

In [16]:
results_dict

{448178: {0.0: {'acc_diff': 0.018499075785582297,
   'best_acc': 0.53049907578558231,
   'naive_acc': 0.512,
   'obs': 4854},
  0.10000000000000001: {'acc_diff': 0.01665064695009244,
   'best_acc': 0.52865064695009245,
   'naive_acc': 0.512,
   'obs': 4854},
  0.20000000000000001: {'acc_diff': 0.014802218114602583,
   'best_acc': 0.52680221811460259,
   'naive_acc': 0.512,
   'obs': 4854},
  0.30000000000000004: {'acc_diff': 0.01665064695009244,
   'best_acc': 0.52865064695009245,
   'naive_acc': 0.512,
   'obs': 4854},
  0.40000000000000002: {'acc_diff': 0.018499075785582297,
   'best_acc': 0.53049907578558231,
   'naive_acc': 0.512,
   'obs': 4854},
  0.5: {'acc_diff': 0.01100000000000001,
   'best_acc': 0.52500000000000002,
   'naive_acc': 0.514,
   'obs': 4830},
  0.60000000000000009: {'acc_diff': 0.014440366972477081,
   'best_acc': 0.52844036697247709,
   'naive_acc': 0.514,
   'obs': 4823},
  0.70000000000000007: {'acc_diff': 0.0089455535390199348,
   'best_acc': 0.5299455535390