# Accuracy across Randomly Sampled Pitchers 

In [1]:
#Import necessary packages
import psycopg2
import sys  
sys.path.append('..')

import numpy as np
import pandas as pd

from config import REDSHIFT_CONFIG
from src.features import *
from src.utils import *
from src.validation import *

# Python 2-only hack: reload(sys) re-exposes sys.setdefaultencoding (hidden by the
# interpreter after startup) so the process-wide default string encoding can be
# switched to UTF-8.
# NOTE(review): this is a global side effect for every cell that follows, and neither
# the reload builtin nor sys.setdefaultencoding exists in Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

from src.exploration import *

In [11]:
# Establish a connection to the redshift database using the settings in REDSHIFT_CONFIG.
# `conn` and `cur` are notebook-level globals: later cells pass `cur` to the query helpers
# and rebind both names if the connection has to be re-established.
conn = create_rs_conn(config=REDSHIFT_CONFIG)
cur = conn.cursor()

In [3]:
#Increase the number of columns displayed with Pandas to 500 so that wide
#dataframes are shown in full instead of being truncated
pd.set_option('display.max_columns', 500)

In [12]:
#Get a random sample of 10 pitchers (num_pitchers below). NOTE: you might want to return
#fewer pitchers to make the code run faster, since every sampled pitcher is modeled in turn.
#seed_num is fixed, presumably so the same sample is drawn on every run -- confirm against
#randomly_sample_pitchers2.
#NOTE(review): min_pitch_count / min_date look like lower bounds on a pitcher's total pitch
#count and most recent appearance (consistent with the output below) -- confirm in the helper.
pitcher_sample = randomly_sample_pitchers2(cursor = cur, 
                                           num_pitchers = 10, 
                                           min_pitch_count = 2000, 
                                           min_date = '2015-01-01', 
                                           seed_num = 35)
#Show the results
pitcher_sample

Unnamed: 0,pitcher,tot_pitch_count,maximum_date
103,448178,5534,2015-07-21
108,448802,10166,2015-06-24
207,476451,12380,2015-07-21
214,477569,4988,2015-07-21
283,518875,4450,2015-07-21
284,518886,5416,2015-07-20
286,519043,6601,2015-07-20
327,543548,9212,2015-07-19
352,571901,3108,2015-07-18
365,592767,6018,2015-05-05


In [13]:
#Grab the pitcher IDs from the dataframe above as a numpy array; the modeling
#loops below iterate over this array one pitcher at a time
pitcher_list = pitcher_sample['pitcher'].values

In [14]:
# Feature columns kept when the modeling data is subset (see subset_data usage).
# One name per line so additions/removals show up as single-line diffs; the
# order is unchanged from the original list.
cols_of_interest = [
    # count / base / out columns
    u'b',
    u's',
    u'on_1b',
    u'on_2b',
    u'on_3b',
    u'o',
    # team record columns
    u'home_wins',
    u'home_loss',
    u'away_wins',
    u'away_loss',
    # batter stance indicator
    u'stand_L',
    # "*_prior" Fastball / Not_Fastball columns
    u'Not_Fastball_pb_prior',
    u'Not_Fastball_pbs_prior',
    u'Fastball_pb_prior',
    u'Fastball_pbs_prior',
    u'Not_Fastball_pc_prior',
    u'Not_Fastball_pcs_prior',
    u'Fastball_pc_prior',
    u'Fastball_pcs_prior',
    u'Not_Fastball_pg_prior',
    u'Not_Fastball_pgs_prior',
    u'Fastball_pg_prior',
    u'Fastball_pgs_prior',
    # type indicators for the last three pitches
    u'last_pitch_type_Fastball',
    u'last_pitch_type_Not_Fastball',
    u'last_pitch_type_not_available',
    u'second_last_pitch_type_Fastball',
    u'second_last_pitch_type_Not_Fastball',
    u'second_last_pitch_type_not_available',
    u'third_last_pitch_type_Fastball',
    u'third_last_pitch_type_Not_Fastball',
    u'third_last_pitch_type_not_available',
    # means over previous pitches
    u'prev_pitches_mean_start_speed',
    u'prev_pitches_mean_end_speed',
    u'prev_pitches_mean_break_y',
    u'prev_pitches_mean_break_angle',
    u'prev_pitches_mean_break_length',
    # pitch counts and season
    u'ingame_pitch_count',
    u'cur_season',
    u'season_pitch_count',
]

In [15]:
#Initialize a dictionary to store the results
results_dict = {}

#Loop through all the pitchers, run the classifiers on them and return the baseline and a classifier accuracies
for pitcher in pitcher_list:
    
    
    print 'starting pitcher', pitcher, '\n'
    
    #using exception handling in case redshift times out
    try:
        #Grab all the pitch data for the pitcher
        pitcher_df = get_pitcher_df_for_modeling(cur,
                                                 pitcher_id = pitcher,
                                                 date_subsetting = False)
    except:
        # Re-establish a connection to redshift
        conn = create_rs_conn(config=REDSHIFT_CONFIG)
        cur = conn.cursor()

        #Retry on the pitchers
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)

    #Create a 90-10 split by date on the pitch data
    subset_date = str(pitcher_df['date'].quantile(.9))[:10]
    modeling_data = split_test_train(pitcher_df, subset_date)

    #Subset the dataframe down to the columns of interest
    baseline_dict = subset_data(modeling_data, cols_of_interest)
    
    #Run 4 classifiers on the data (returns dictionary containing all fitted classifiers)
    classifier_dict = run_all_classifiers(baseline_dict)
    
    #Grab predictions for each classifier
    rf_preds = classifier_dict['rf'].predict(baseline_dict['test_data'])
    gbm_preds = classifier_dict['gbm'].predict(baseline_dict['test_data'])
    log_reg_preds = classifier_dict['log_reg'].predict(baseline_dict['test_data'])
    lin_svc_preds = classifier_dict['lin_svc'].predict(baseline_dict['test_data'])
    
    #Create an insert in the results dictionary for the pitcher and store the accuracy results
    results_dict[pitcher] = {}
    results_dict[pitcher]['baseline_acc'] = naive_accuracy(baseline_dict)
    results_dict[pitcher]['rf_improve_over_baseline'] = accuracy_score(baseline_dict['test_targets'], rf_preds) - naive_accuracy(baseline_dict)
    results_dict[pitcher]['rf_acc'] = accuracy_score(baseline_dict['test_targets'], rf_preds)
    results_dict[pitcher]['gbm_acc'] = accuracy_score(baseline_dict['test_targets'], gbm_preds)
    results_dict[pitcher]['log_reg_acc'] = accuracy_score(baseline_dict['test_targets'], log_reg_preds)
    results_dict[pitcher]['lin_svc_acc'] = accuracy_score(baseline_dict['test_targets'], lin_svc_preds)

In [16]:
#Show the per-pitcher accuracies from the run restricted to cols_of_interest
results_dict

{448178: {'baseline_acc': 0.677,
  'gbm_acc': 0.70848056537102477,
  'lin_svc_acc': 0.71554770318021199,
  'log_reg_acc': 0.70671378091872794,
  'rf_acc': 0.70141342756183744,
  'rf_improve_over_baseline': 0.024413427561837397},
 448802: {'baseline_acc': 0.592,
  'gbm_acc': 0.68380213385063049,
  'lin_svc_acc': 0.67216294859359849,
  'log_reg_acc': 0.66828322017458774,
  'rf_acc': 0.67895247332686715,
  'rf_improve_over_baseline': 0.086952473326867175},
 476451: {'baseline_acc': 0.573,
  'gbm_acc': 0.61386138613861385,
  'lin_svc_acc': 0.62604722010662606,
  'log_reg_acc': 0.61614623000761615,
  'rf_acc': 0.6100533130236101,
  'rf_improve_over_baseline': 0.037053313023610146},
 477569: {'baseline_acc': 0.533,
  'gbm_acc': 0.50853889943074004,
  'lin_svc_acc': 0.51992409867172673,
  'log_reg_acc': 0.56166982922201136,
  'rf_acc': 0.50474383301707781,
  'rf_improve_over_baseline': -0.028256166982922215},
 518875: {'baseline_acc': 0.518,
  'gbm_acc': 0.56415929203539827,
  'lin_svc_acc': 

In [17]:
#Initialize a dictionary to store the results
results_dict2 = {}

#Loop through all the pitchers, run the classifiers on them and return the baseline and a classifier accuracies
for pitcher in pitcher_list:
    
    
    print 'starting pitcher', pitcher, '\n'
    
    #using exception handling in case redshift times out
    try:
        #Grab all the pitch data for the pitcher
        pitcher_df = get_pitcher_df_for_modeling(cur,
                                                 pitcher_id = pitcher,
                                                 date_subsetting = False)
    except:
        # Re-establish a connection to redshift
        conn = create_rs_conn(config=REDSHIFT_CONFIG)
        cur = conn.cursor()

        #Retry on the pitchers
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)

    #Create a 90-10 split by date on the pitch data
    subset_date = str(pitcher_df['date'].quantile(.9))[:10]
    modeling_data = split_test_train(pitcher_df, subset_date)

    #Subset the dataframe down to the columns of interest
    baseline_dict = modeling_data
    
    #Run 4 classifiers on the data (returns dictionary containing all fitted classifiers)
    classifier_dict = run_all_classifiers(baseline_dict)
    
    #Grab predictions for each classifier
    rf_preds = classifier_dict['rf'].predict(baseline_dict['test_data'])
    gbm_preds = classifier_dict['gbm'].predict(baseline_dict['test_data'])
    log_reg_preds = classifier_dict['log_reg'].predict(baseline_dict['test_data'])
    lin_svc_preds = classifier_dict['lin_svc'].predict(baseline_dict['test_data'])
    
    #Create an insert in the results dictionary for the pitcher and store the accuracy results
    results_dict2[pitcher] = {}
    results_dict2[pitcher]['baseline_acc'] = naive_accuracy(baseline_dict)
    results_dict2[pitcher]['rf_improve_over_baseline'] = accuracy_score(baseline_dict['test_targets'], rf_preds) - naive_accuracy(baseline_dict)
    results_dict2[pitcher]['rf_acc'] = accuracy_score(baseline_dict['test_targets'], rf_preds)
    results_dict2[pitcher]['gbm_acc'] = accuracy_score(baseline_dict['test_targets'], gbm_preds)
    results_dict2[pitcher]['log_reg_acc'] = accuracy_score(baseline_dict['test_targets'], log_reg_preds)
    results_dict2[pitcher]['lin_svc_acc'] = accuracy_score(baseline_dict['test_targets'], lin_svc_preds)

In [18]:
#Show the per-pitcher accuracies from the run that used the split data without
#subsetting to cols_of_interest
results_dict2

{448178: {'baseline_acc': 0.677,
  'gbm_acc': 0.6872791519434629,
  'lin_svc_acc': 0.71908127208480566,
  'log_reg_acc': 0.70494699646643111,
  'rf_acc': 0.69964664310954061,
  'rf_improve_over_baseline': 0.022646643109540565},
 448802: {'baseline_acc': 0.592,
  'gbm_acc': 0.67895247332686715,
  'lin_svc_acc': 0.66925315227934046,
  'log_reg_acc': 0.67701260911736183,
  'rf_acc': 0.68186226964112517,
  'rf_improve_over_baseline': 0.089862269641125203},
 476451: {'baseline_acc': 0.573,
  'gbm_acc': 0.61462300076161458,
  'lin_svc_acc': 0.60091393754760092,
  'log_reg_acc': 0.60700685453160697,
  'rf_acc': 0.60700685453160697,
  'rf_improve_over_baseline': 0.034006854531607011},
 477569: {'baseline_acc': 0.533,
  'gbm_acc': 0.52371916508538896,
  'lin_svc_acc': 0.5161290322580645,
  'log_reg_acc': 0.5597722960151803,
  'rf_acc': 0.55787476280834913,
  'rf_improve_over_baseline': 0.024874762808349105},
 518875: {'baseline_acc': 0.518,
  'gbm_acc': 0.56858407079646023,
  'lin_svc_acc': 0.5