# Accuracy across Randomly Sampled Pitchers 

In [1]:
#Import necessary packages
import psycopg2
import sys  
sys.path.append('..')

import numpy as np
import pandas as pd

from config import REDSHIFT_CONFIG
from src.features import *
from src.utils import *
from src.validation import *

# Python 2-only workaround: sys.setdefaultencoding is removed from the sys module
# at interpreter start-up, so sys is reloaded to get it back and the process-wide
# default string encoding is then switched to UTF-8.
# NOTE(review): the reload() builtin and sys.setdefaultencoding do not exist on
# Python 3, so this cell will fail there.
reload(sys)
sys.setdefaultencoding('utf8')

from src.exploration import *

In [2]:
# Establish a connection to the Redshift database using the settings in REDSHIFT_CONFIG,
# then open the cursor (`cur`) that later cells pass to the query helpers
conn = create_rs_conn(config=REDSHIFT_CONFIG)
cur = conn.cursor()

In [3]:
# Increase the number of columns pandas will display to 500 so that wide
# DataFrames are shown in full rather than being truncated
pd.set_option('display.max_columns', 500)

In [4]:
# Get a random sample of 20 pitchers. NOTE: you might want to return fewer pitchers
# to make the code run faster.
# seed_num is fixed, presumably so the same sample is drawn on every run -- TODO confirm
pitcher_sample = randomly_sample_pitchers2(
    cursor=cur,
    num_pitchers=20,
    min_pitch_count=2000,
    min_date='2015-01-01',
    seed_num=30,
)

# Show the results
pitcher_sample

Unnamed: 0,pitcher,tot_pitch_count,maximum_date
11,279571,7676,2015-06-25
27,407908,6798,2015-07-21
60,433584,17338,2015-07-19
79,435400,5466,2015-07-21
91,445590,10035,2015-05-26
97,446372,9069,2015-07-18
109,449079,5115,2015-04-10
150,456034,21870,2015-07-18
174,460008,3007,2015-04-28
202,475138,3243,2015-04-19


In [5]:
# Grab the pitcher IDs from the 'pitcher' column of the sampled dataframe above;
# the modeling loop iterates over this array
pitcher_list = pitcher_sample['pitcher'].values

In [7]:
# Names of the dataframe columns that are kept as model features.
# Order is preserved exactly; one name per line to make diffs easy to read.
cols_of_interest = [
    # b / s / o and the on_* base columns
    u'b',
    u's',
    u'on_1b',
    u'on_2b',
    u'on_3b',
    u'o',
    # home/away win-loss columns
    u'home_wins',
    u'home_loss',
    u'away_wins',
    u'away_loss',
    u'stand_L',
    # *_prior columns (pb / pbs / pc / pcs / pg / pgs, Fastball vs Not_Fastball)
    u'Not_Fastball_pb_prior',
    u'Not_Fastball_pbs_prior',
    u'Fastball_pb_prior',
    u'Fastball_pbs_prior',
    u'Not_Fastball_pc_prior',
    u'Not_Fastball_pcs_prior',
    u'Fastball_pc_prior',
    u'Fastball_pcs_prior',
    u'Not_Fastball_pg_prior',
    u'Not_Fastball_pgs_prior',
    u'Fastball_pg_prior',
    u'Fastball_pgs_prior',
    # indicator columns for the last / second-last / third-last pitch type
    u'last_pitch_type_Fastball',
    u'last_pitch_type_Not_Fastball',
    u'last_pitch_type_not_available',
    u'second_last_pitch_type_Fastball',
    u'second_last_pitch_type_Not_Fastball',
    u'second_last_pitch_type_not_available',
    u'third_last_pitch_type_Fastball',
    u'third_last_pitch_type_Not_Fastball',
    u'third_last_pitch_type_not_available',
    # prev_pitches_mean_* columns
    u'prev_pitches_mean_start_speed',
    u'prev_pitches_mean_end_speed',
    u'prev_pitches_mean_break_y',
    u'prev_pitches_mean_break_angle',
    u'prev_pitches_mean_break_length',
    # pitch-count and season columns
    u'ingame_pitch_count',
    u'cur_season',
    u'season_pitch_count',
]

In [9]:
#Initialize a dictionary to store the results
results_dict = {}

#Loop through all the pitchers, run the classifiers on them and return the baseline and a classifier accuracies
for pitcher in pitcher_list:
    
    
    print 'starting pitcher', pitcher, '\n'
    
    #using exception handling in case redshift times out
    try:
        #Grab all the pitch data for the pitcher
        pitcher_df = get_pitcher_df_for_modeling(cur,
                                                 pitcher_id = pitcher,
                                                 date_subsetting = False,
                                                 table = 'all_pitch_data_reclass')
    except:
        # Re-establish a connection to redshift
        conn = create_rs_conn(config=REDSHIFT_CONFIG)
        cur = conn.cursor()

        #Retry on the pitchers
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False,
                                table = 'all_pitch_data_reclass')

    #Create a 90-10 split by date on the pitch data
    subset_date = str(pitcher_df['date'].quantile(.9))[:10]
    modeling_data = split_test_train(pitcher_df, subset_date)

    #Subset the dataframe down to the columns of interest
    baseline_dict = subset_data(modeling_data, cols_of_interest)
    
    #Run 4 classifiers on the data (returns dictionary containing all fitted classifiers)
    classifier_dict = run_all_classifiers(baseline_dict)
    
    #Grab predictions for each classifier
    rf_preds = classifier_dict['rf'].predict(baseline_dict['test_data'])
    gbm_preds = classifier_dict['gbm'].predict(baseline_dict['test_data'])
    log_reg_preds = classifier_dict['log_reg'].predict(baseline_dict['test_data'])
    lin_svc_preds = classifier_dict['lin_svc'].predict(baseline_dict['test_data'])
    
    #Create an insert in the results dictionary for the pitcher and store the accuracy results
    results_dict[pitcher] = {}
    results_dict[pitcher]['baseline_acc'] = naive_accuracy(baseline_dict)
    results_dict[pitcher]['rf_improve_over_baseline'] = accuracy_score(baseline_dict['test_targets'], rf_preds) - naive_accuracy(baseline_dict)
    results_dict[pitcher]['rf_acc'] = accuracy_score(baseline_dict['test_targets'], rf_preds)
    results_dict[pitcher]['gbm_acc'] = accuracy_score(baseline_dict['test_targets'], gbm_preds)
    results_dict[pitcher]['log_reg_acc'] = accuracy_score(baseline_dict['test_targets'], log_reg_preds)
    results_dict[pitcher]['lin_svc_acc'] = accuracy_score(baseline_dict['test_targets'], lin_svc_preds)

In [10]:
# Display the per-pitcher accuracy dictionary
results_dict

{279571: {'baseline_acc': 0.595,
  'gbm_acc': 0.60438144329896903,
  'lin_svc_acc': 0.60180412371134018,
  'log_reg_acc': 0.59536082474226804,
  'rf_acc': 0.59664948453608246,
  'rf_improve_over_baseline': 0.0016494845360824906},
 407908: {'baseline_acc': 0.789,
  'gbm_acc': 0.78908554572271383,
  'lin_svc_acc': 0.78908554572271383,
  'log_reg_acc': 0.78908554572271383,
  'rf_acc': 0.78908554572271383,
  'rf_improve_over_baseline': 8.55457227137979e-05},
 433584: {'baseline_acc': 0.661,
  'gbm_acc': 0.66025641025641024,
  'lin_svc_acc': 0.66142191142191142,
  'log_reg_acc': 0.66083916083916083,
  'rf_acc': 0.66142191142191142,
  'rf_improve_over_baseline': 0.00042191142191139175},
 435400: {'baseline_acc': 1.0,
  'gbm_acc': 0.85321100917431192,
  'lin_svc_acc': 1.0,
  'log_reg_acc': 1.0,
  'rf_acc': 1.0,
  'rf_improve_over_baseline': 0.0},
 445590: {'baseline_acc': 0.543,
  'gbm_acc': 0.51130776794493604,
  'lin_svc_acc': 0.52409046214355948,
  'log_reg_acc': 0.5260570304818093,
  'rf_