# Determine if Hyper-Parameters are Distinct or Common Across Pitchers 

In [1]:
#Import necessary packages
import psycopg2
import sys  

import numpy as np
import pandas as pd

from config import REDSHIFT_CONFIG
from src.features import *
from src.utils import *
from src.validation import *

# Python 2-only workaround: reload(sys) is called so that
# sys.setdefaultencoding is available, then the process-wide default string
# encoding is switched to UTF-8.
# NOTE(review): neither call works on Python 3, and the setting affects every
# implicit str/unicode conversion in the process -- confirm it is still needed.
reload(sys)
sys.setdefaultencoding('utf8')

from src.exploration import *

In [2]:
# Establish a connection to the redshift database
conn = create_rs_conn(config=REDSHIFT_CONFIG)
cur = conn.cursor()

In [3]:
#Increase the number of columns displayed with Pandas
pd.set_option('display.max_columns', 500)

In [4]:
cur.execute('''select max(date) from all_pitch_data_reclass limit 10;''')
rows = cur.fetchall()
header = [colnames[0] for colnames in cur.description]
test = pd.DataFrame(rows)
test.columns = header
test

Unnamed: 0,max
0,2015-07-21


Modify the function to randomly sample pitchers from the db

In [5]:
from random import sample, seed
def randomly_sample_pitchers2(cursor, num_pitchers = 5, min_pitch_count = 600, min_date = '2015-01-01', seed_num = None):
    '''Randomly sample pitchers from the db represented by "cursor".

    Only pitchers with at least "min_pitch_count" rows in all_pitch_data_reclass
    and whose latest pitch date is after "min_date" are eligible.

    Input:
        cursor: DB handle (DB-API cursor accepting "%s" query parameters, as psycopg2 does)
        num_pitchers: The number of pitchers to sample
        min_pitch_count: Minimum number of pitches a pitcher must have thrown in order to be
            considered in the random sampling
        min_date: A pitcher is only considered if their most recent pitch is after this date
        seed_num: If you want to be able to replicate the results, set a seed
            (this seeds the global "random" module state)
    Output: Pandas DF with one row per sampled pitcher and the columns returned by the query
        (pitcher, tot_pitch_count, maximum_date)
    Raises: ValueError (from random.sample) if fewer than "num_pitchers" pitchers are eligible'''

    #Get all pitchers meeting the min pitches / min date criteria.
    #The values are passed as query parameters rather than formatted into the
    #SQL text, so the driver handles quoting.
    get_pitchers_query = '''SELECT pitcher,
                                COUNT(*) as tot_pitch_count,
                                MAX(date) as maximum_date
                        FROM all_pitch_data_reclass
                        GROUP BY pitcher
                        HAVING count(*) >= %s AND
                                MAX(date) > %s
                        ORDER BY pitcher'''
    cursor.execute(get_pitchers_query, (min_pitch_count, min_date))

    rows = cursor.fetchall()
    header = [column[0] for column in cursor.description]
    #Passing columns= here keeps an empty result set from failing on column assignment
    pitcher_df = pd.DataFrame(rows, columns=header)

    #Get all the pitcher ids and sample from them
    if seed_num is not None:
        seed(seed_num)

    #random.sample needs a real sequence; a numpy array is rejected on newer Pythons
    pitcher_id_sample = sample(pitcher_df['pitcher'].tolist(), num_pitchers)

    return pitcher_df[pitcher_df['pitcher'].isin(pitcher_id_sample)]

In [13]:
#test it out
pitcher_list = randomly_sample_pitchers2(cur, min_pitch_count = 1000, seed_num = 35, num_pitchers=10)

In [14]:
pitcher_list

Unnamed: 0,pitcher,tot_pitch_count,maximum_date
121,450729,17088,2015-07-18
126,451661,2112,2015-07-21
242,493159,1660,2015-06-22
250,501822,1936,2015-06-14
330,523989,4795,2015-07-20
331,527048,3383,2015-07-17
333,533167,3741,2015-07-21
382,547888,3127,2015-07-17
410,573204,1561,2015-04-30
426,595307,2647,2015-07-21


In [20]:
#Function to calculate the naive accuracy for a pitcher
def naive_accuracy(data_dict):
    """Return the accuracy of always predicting the most common test label.

    Input:
        data_dict: modeling dictionary whose 'test_targets' entry is a pandas
            Series of class labels
    Output: share of the test rows that carry the most frequent label,
        rounded to 3 decimals
    """
    label_counts = data_dict['test_targets'].value_counts()
    # Use .max() rather than [0]: indexing with [0] is a label lookup, so with
    # integer targets it returns the count of label 0 instead of the majority
    # class count.
    biggest_count = label_counts.max()
    all_counts = label_counts.sum()
    return round(float(biggest_count) / all_counts, 3)

In [9]:
def subset_data(modeling_dict, cols_of_interest):
    """Return a shallow copy of modeling_dict with the feature frames narrowed.

    The 'train_data' and 'test_data' entries are replaced by their
    cols_of_interest selection; every other entry is carried over unchanged
    and the input dictionary itself is not modified.
    """
    trimmed = modeling_dict.copy()
    for frame_key in ('train_data', 'test_data'):
        trimmed[frame_key] = trimmed[frame_key][cols_of_interest]
    return trimmed

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint as sp_randint
from sklearn.metrics import accuracy_score

In [11]:
def run_randomsearch_classifier2(classifier, data_dict):
    """Fit a search-style classifier and summarise how it does on the test split.

    classifier must expose fit, predict and best_params_ (e.g. a
    RandomizedSearchCV instance). data_dict supplies 'train_data',
    'train_targets', 'test_data' and 'test_targets'.

    Returns a dictionary with:
        'params': the best parameters found by the search
        'baseline_accuracy': naive_accuracy(data_dict)
        'accuracy': accuracy of the fitted classifier on the test data
    """
    # Fit on the training split; the search picks its best parameters here
    classifier.fit(data_dict['train_data'], data_dict['train_targets'])

    # Collect the chosen parameters, the baseline and the test accuracy
    return {
        'params': classifier.best_params_,
        'baseline_accuracy': naive_accuracy(data_dict),
        'accuracy': accuracy_score(data_dict['test_targets'],
                                   classifier.predict(data_dict['test_data'])),
    }

In [12]:
# Feature columns fed to every classifier, grouped by theme
cols_of_interest = [
    # Count, base runners and outs
    u'b', u's', u'on_1b', u'on_2b', u'on_3b', u'o',
    # Team records
    u'home_wins', u'home_loss', u'away_wins', u'away_loss',
    # Batter stance
    u'stand_L',
    # "pb" / "pbs" priors
    u'Not_Fastball_pb_prior', u'Not_Fastball_pbs_prior',
    u'Fastball_pb_prior', u'Fastball_pbs_prior',
    # "pc" / "pcs" priors
    u'Not_Fastball_pc_prior', u'Not_Fastball_pcs_prior',
    u'Fastball_pc_prior', u'Fastball_pcs_prior',
    # "pg" / "pgs" priors
    u'Not_Fastball_pg_prior', u'Not_Fastball_pgs_prior',
    u'Fastball_pg_prior', u'Fastball_pgs_prior',
    # Types of the previous three pitches
    u'last_pitch_type_Fastball',
    u'last_pitch_type_Not_Fastball',
    u'last_pitch_type_not_available',
    u'second_last_pitch_type_Fastball',
    u'second_last_pitch_type_Not_Fastball',
    u'second_last_pitch_type_not_available',
    u'third_last_pitch_type_Fastball',
    u'third_last_pitch_type_Not_Fastball',
    u'third_last_pitch_type_not_available',
    # Averages over previous pitches
    u'prev_pitches_mean_start_speed',
    u'prev_pitches_mean_end_speed',
    u'prev_pitches_mean_break_y',
    u'prev_pitches_mean_break_angle',
    u'prev_pitches_mean_break_length',
    # Pitch counts and season
    u'ingame_pitch_count', u'cur_season', u'season_pitch_count',
]

In [19]:
#Get the parameters for the four models to search over
# Logistic regression search space
log_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.5, 1.0, 2.0, 5.0, 7.0],
    'random_state': [35],
}

# Gradient boosting search space
gbm_params = {
    'loss': ['deviance'],
    'n_estimators': [10, 50, 100, 150, 250, 350, 500],
    'max_depth': [3, None],
    'max_features': [None, 'auto'],
    'min_samples_split': sp_randint(1, 11),
    'min_samples_leaf': sp_randint(1, 11),
    'random_state': [35],
}

# Random forest search space (same tree settings as the GBM, without 'loss')
rf_params = {
    'n_estimators': [10, 50, 100, 150, 250, 350, 500],
    'max_depth': [3, None],
    'max_features': [None, 'auto'],
    'min_samples_split': sp_randint(1, 11),
    'min_samples_leaf': sp_randint(1, 11),
    'random_state': [35],
}

# Linear SVC search space
lin_svc_params = {
    'C': [0.1, 0.5, 0.7, 1.0, 2.0, 5.0],
    'penalty': ['l1', 'l2'],
    'dual': [False],
    'random_state': [35],
}

# Number of parameter settings sampled by each randomized search
n_iter_search = 25

In [20]:
#Get the list of pitcher ids to loop through
# NOTE(review): the only visible assignment to `test` (the max-date query) has
# no 'pitcher' column; presumably `test` held the sampled-pitchers frame when
# this cell was run -- confirm before re-running.
pitcher_list = test['pitcher'].values

#Initialize the dictionaries to hold the results (keyed by pitcher id)
rf_dict = {}
gbm_dict = {}
log_dict = {}
lin_svc_dict = {}

#One (results dict, estimator class, search space) entry per model family
search_specs = [
    (rf_dict, RandomForestClassifier, rf_params),
    (gbm_dict, GradientBoostingClassifier, gbm_params),
    (log_dict, LogisticRegression, log_params),
    (lin_svc_dict, LinearSVC, lin_svc_params),
]

#Run through the pitchers
for pitcher in pitcher_list:

    #Get the pitcher's data, reconnecting once if the first attempt fails
    try:
        pitcher_df = get_pitcher_df_for_modeling(cur,
                                pitcher_id = pitcher,
                                date_subsetting = False)
    except Exception:
        # `except Exception` (not a bare except) so that Ctrl-C / SystemExit
        # stop the loop instead of triggering a reconnect.
        # Establish a connection to the redshift database
        conn = create_rs_conn(config=REDSHIFT_CONFIG)
        cur = conn.cursor()

        #Retry on the pitchers
        pitcher_df = get_pitcher_df_for_modeling(cur,
                                pitcher_id = pitcher,
                                date_subsetting = False)

    #Split train/test at the 90th percentile of the pitcher's dates
    subset_date = str(pitcher_df['date'].quantile(.9))[:10]
    modeling_data = split_test_train(pitcher_df, subset_date)

    #Subset down to the columns of interest
    baseline_dict = subset_data(modeling_data, cols_of_interest)

    #Random Search over 4 different classifiers
    for results, estimator_cls, param_space in search_specs:
        search = RandomizedSearchCV(estimator_cls(),
                                    param_distributions = param_space,
                                    n_iter = n_iter_search)
        results[pitcher] = run_randomsearch_classifier2(search, baseline_dict)

In [3]:
rf_dict = {450729: {'accuracy': 0.80372250423011848,
  'baseline_accuracy': 0.702,
  'params': {'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 10,
   'min_samples_split': 6,
   'n_estimators': 50,
   'random_state': 35}},
 451661: {'accuracy': 0.71162790697674416,
  'baseline_accuracy': 0.701,
  'params': {'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 10,
   'min_samples_split': 10,
   'n_estimators': 500,
   'random_state': 35}},
 493159: {'accuracy': 0.71359223300970875,
  'baseline_accuracy': 0.585,
  'params': {'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 8,
   'min_samples_split': 7,
   'n_estimators': 50,
   'random_state': 35}},
 501822: {'accuracy': 0.78500000000000003,
  'baseline_accuracy': 0.649,
  'params': {'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 8,
   'min_samples_split': 1,
   'n_estimators': 350,
   'random_state': 35}},
 523989: {'accuracy': 0.63148148148148153,
  'baseline_accuracy': 0.646,
  'params': {'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 8,
   'min_samples_split': 6,
   'n_estimators': 500,
   'random_state': 35}},
 527048: {'accuracy': 0.61864406779661019,
  'baseline_accuracy': 0.595,
  'params': {'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 2,
   'min_samples_split': 9,
   'n_estimators': 150,
   'random_state': 35}},
 533167: {'accuracy': 0.61007957559681703,
  'baseline_accuracy': 0.51,
  'params': {'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 7,
   'min_samples_split': 10,
   'n_estimators': 50,
   'random_state': 35}},
 547888: {'accuracy': 0.69009584664536738,
  'baseline_accuracy': 0.722,
  'params': {'max_depth': 3,
   'max_features': None,
   'min_samples_leaf': 8,
   'min_samples_split': 5,
   'n_estimators': 50,
   'random_state': 35}},
 573204: {'accuracy': 0.71604938271604934,
  'baseline_accuracy': 0.656,
  'params': {'max_depth': 3,
   'max_features': None,
   'min_samples_leaf': 10,
   'min_samples_split': 9,
   'n_estimators': 10,
   'random_state': 35}},
 595307: {'accuracy': 0.64552238805970152,
  'baseline_accuracy': 0.706,
  'params': {'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 9,
   'min_samples_split': 2,
   'n_estimators': 500,
   'random_state': 35}}}

In [21]:
pitcher_list

array([450729, 451661, 493159, 501822, 523989, 527048, 533167, 547888,
       573204, 595307], dtype=int64)

In [23]:
accuracy_dict = {}
pitcher = pitcher_list[0]

accuracy_dict[pitcher] = {}
try:
    pitcher_df = get_pitcher_df_for_modeling(cur, 
                            pitcher_id = pitcher, 
                            date_subsetting = False)
except:
    # Establish a connection to the redshift database
    conn = create_rs_conn(config=REDSHIFT_CONFIG)
    cur = conn.cursor()

    #Retry on the pitchers
    pitcher_df = get_pitcher_df_for_modeling(cur, 
                            pitcher_id = pitcher, 
                            date_subsetting = False)

#Sort the dataframe and create a 90-10 split by date
subset_date = str(pitcher_df['date'].quantile(.9))[:10]
modeling_data = split_test_train(pitcher_df, subset_date)

#Subset down to the columns of interest
baseline_dict = subset_data(modeling_data, cols_of_interest)

accuracy_list = []
diff_from_best = []
print 'naive_accuracy:', rf_dict[pitcher]['baseline_accuracy']
print 'best_accuracy:', rf_dict[pitcher]['accuracy']
for new_pitcher in pitcher_list:
    rf_test = RandomForestClassifier(min_samples_leaf= rf_dict[new_pitcher]['params']['min_samples_leaf'], 
                                     min_samples_split=rf_dict[new_pitcher]['params']['min_samples_split'],
                                     n_estimators=rf_dict[new_pitcher]['params']['n_estimators'])
    new_acc = run_classifier(rf_test, baseline_dict)
    accuracy_list.append(new_acc)
    diff_from_best.append(new_acc - rf_dict[pitcher]['accuracy'])

print 'Acc. list:', accuracy_list
print 'Acc. Mean:', pd.Series(accuracy_list).mean()
print 'Acc. SD:', pd.Series(accuracy_list).std()
print 'Acc. diff list:', diff_from_best
print 'Acc. diff Mean:', pd.Series(diff_from_best).mean()
print 'Acc. diff SD:', pd.Series(diff_from_best).std()

accuracy_dict[pitcher]['acc_list'] = accuracy_list
accuracy_dict[pitcher]['acc_mean'] = pd.Series(accuracy_list).mean()
accuracy_dict[pitcher]['acc_std'] = pd.Series(accuracy_list).std()
accuracy_dict[pitcher]['acc_diff_list'] = diff_from_best
accuracy_dict[pitcher]['acc_diff_mean'] = pd.Series(diff_from_best).mean()
accuracy_dict[pitcher]['acc_diff_sd'] = pd.Series(diff_from_best).std()

In [24]:
accuracy_dict = {}
for pitcher in pitcher_list:
    
    print 'starting pitcher', pitcher, '\n'
    
    accuracy_dict[pitcher] = {}
    
    try:
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)
    except:
        # Establish a connection to the redshift database
        conn = create_rs_conn(config=REDSHIFT_CONFIG)
        cur = conn.cursor()

        #Retry on the pitchers
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)

    #Sort the dataframe and create a 90-10 split by date
    subset_date = str(pitcher_df['date'].quantile(.9))[:10]
    modeling_data = split_test_train(pitcher_df, subset_date)

    #Subset down to the columns of interest
    baseline_dict = subset_data(modeling_data, cols_of_interest)

    accuracy_list = []
    diff_from_best = []
    print 'naive_accuracy:', rf_dict[pitcher]['baseline_accuracy']
    print 'best_accuracy:', rf_dict[pitcher]['accuracy']
    for new_pitcher in pitcher_list:
        rf_test = RandomForestClassifier(min_samples_leaf= rf_dict[new_pitcher]['params']['min_samples_leaf'], 
                                         min_samples_split=rf_dict[new_pitcher]['params']['min_samples_split'],
                                         n_estimators=rf_dict[new_pitcher]['params']['n_estimators'])
        new_acc = run_classifier(rf_test, baseline_dict)
        accuracy_list.append(new_acc)
        diff_from_best.append(new_acc - rf_dict[pitcher]['accuracy'])

    print 'Acc. list:', accuracy_list
    print 'Acc. Mean:', pd.Series(accuracy_list).mean()
    print 'Acc. SD:', pd.Series(accuracy_list).std()
    print 'Acc. diff list:', diff_from_best
    print 'Acc. diff Mean:', pd.Series(diff_from_best).mean()
    print 'Acc. diff SD:', pd.Series(diff_from_best).std()

    accuracy_dict[pitcher]['acc_list'] = accuracy_list
    accuracy_dict[pitcher]['acc_mean'] = pd.Series(accuracy_list).mean()
    accuracy_dict[pitcher]['acc_std'] = pd.Series(accuracy_list).std()
    accuracy_dict[pitcher]['acc_diff_list'] = diff_from_best
    accuracy_dict[pitcher]['acc_diff_mean'] = pd.Series(diff_from_best).mean()
    accuracy_dict[pitcher]['acc_diff_sd'] = pd.Series(diff_from_best).std()

In [25]:
accuracy_dict

{450729: {'acc_diff_list': [-0.00056401579244225175,
   -0.00056401579244225175,
   -0.002256063169768785,
   -0.0011280315848843925,
   0.0,
   -0.0090242526790750288,
   0.0,
   -0.002256063169768785,
   -0.021432600112803235,
   -0.0011280315848843925],
  'acc_diff_mean': -0.0038353073886069123,
  'acc_diff_sd': 0.0067252297486844661,
  'acc_list': [0.80315848843767623,
   0.80315848843767623,
   0.8014664410603497,
   0.80259447264523409,
   0.80372250423011848,
   0.79469825155104346,
   0.80372250423011848,
   0.8014664410603497,
   0.78228990411731525,
   0.80259447264523409],
  'acc_mean': 0.79988719684151166,
  'acc_std': 0.0067252297486780008},
 451661: {'acc_diff_list': [-0.009302325581395321,
   -0.013953488372092981,
   0.013953488372093092,
   -0.013953488372092981,
   -0.009302325581395321,
   0.0046511627906976605,
   -0.013953488372092981,
   0.0,
   -0.013953488372092981,
   0.0046511627906976605],
  'acc_diff_mean': -0.0051162790697674154,
  'acc_diff_sd': 0.01015474

In [4]:
gbm_dict = {450729: {'accuracy': 0.80372250423011848,
  'baseline_accuracy': 0.702,
  'params': {'loss': 'deviance',
   'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 6,
   'min_samples_split': 6,
   'n_estimators': 10,
   'random_state': 35}},
 451661: {'accuracy': 0.71627906976744182,
  'baseline_accuracy': 0.701,
  'params': {'loss': 'deviance',
   'max_depth': 3,
   'max_features': None,
   'min_samples_leaf': 9,
   'min_samples_split': 3,
   'n_estimators': 10,
   'random_state': 35}},
 493159: {'accuracy': 0.62621359223300976,
  'baseline_accuracy': 0.585,
  'params': {'loss': 'deviance',
   'max_depth': None,
   'max_features': 'auto',
   'min_samples_leaf': 3,
   'min_samples_split': 8,
   'n_estimators': 100,
   'random_state': 35}},
 501822: {'accuracy': 0.76500000000000001,
  'baseline_accuracy': 0.649,
  'params': {'loss': 'deviance',
   'max_depth': 3,
   'max_features': None,
   'min_samples_leaf': 8,
   'min_samples_split': 4,
   'n_estimators': 10,
   'random_state': 35}},
 523989: {'accuracy': 0.63148148148148153,
  'baseline_accuracy': 0.646,
  'params': {'loss': 'deviance',
   'max_depth': 3,
   'max_features': None,
   'min_samples_leaf': 8,
   'min_samples_split': 10,
   'n_estimators': 10,
   'random_state': 35}},
 527048: {'accuracy': 0.61581920903954801,
  'baseline_accuracy': 0.595,
  'params': {'loss': 'deviance',
   'max_depth': 3,
   'max_features': None,
   'min_samples_leaf': 5,
   'min_samples_split': 7,
   'n_estimators': 100,
   'random_state': 35}},
 533167: {'accuracy': 0.51193633952254647,
  'baseline_accuracy': 0.51,
  'params': {'loss': 'deviance',
   'max_depth': None,
   'max_features': 'auto',
   'min_samples_leaf': 8,
   'min_samples_split': 6,
   'n_estimators': 250,
   'random_state': 35}},
 547888: {'accuracy': 0.69648562300319494,
  'baseline_accuracy': 0.722,
  'params': {'loss': 'deviance',
   'max_depth': 3,
   'max_features': 'auto',
   'min_samples_leaf': 10,
   'min_samples_split': 6,
   'n_estimators': 50,
   'random_state': 35}},
 573204: {'accuracy': 0.66666666666666663,
  'baseline_accuracy': 0.656,
  'params': {'loss': 'deviance',
   'max_depth': None,
   'max_features': 'auto',
   'min_samples_leaf': 10,
   'min_samples_split': 2,
   'n_estimators': 10,
   'random_state': 35}},
 595307: {'accuracy': 0.64925373134328357,
  'baseline_accuracy': 0.706,
  'params': {'loss': 'deviance',
   'max_depth': 3,
   'max_features': None,
   'min_samples_leaf': 4,
   'min_samples_split': 3,
   'n_estimators': 10,
   'random_state': 35}}}

In [None]:
gbm_accuracy_dict = {}
for pitcher in pitcher_list:
    
    print 'starting pitcher', pitcher, '\n'
    
    gbm_accuracy_dict[pitcher] = {}
    
    try:
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)
    except:
        # Establish a connection to the redshift database
        conn = create_rs_conn(config=REDSHIFT_CONFIG)
        cur = conn.cursor()

        #Retry on the pitchers
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)

    #Sort the dataframe and create a 90-10 split by date
    subset_date = str(pitcher_df['date'].quantile(.9))[:10]
    modeling_data = split_test_train(pitcher_df, subset_date)

    #Subset down to the columns of interest
    baseline_dict = subset_data(modeling_data, cols_of_interest)

    accuracy_list = []
    diff_from_best = []
    print 'naive_accuracy:', gbm_dict[pitcher]['baseline_accuracy']
    print 'best_accuracy:', gbm_dict[pitcher]['accuracy']
    gbm_accuracy_dict[pitcher]['best_acc'] = gbm_dict[pitcher]['baseline_accuracy']
    gbm_accuracy_dict[pitcher]['naive_acc'] = gbm_dict[pitcher]['accuracy']
    
    for new_pitcher in pitcher_list:
        rf_test = GradientBoostingClassifier(min_samples_leaf= gbm_dict[new_pitcher]['params']['min_samples_leaf'], 
                                         min_samples_split=gbm_dict[new_pitcher]['params']['min_samples_split'],
                                         n_estimators=gbm_dict[new_pitcher]['params']['n_estimators'])
        new_acc = run_classifier(rf_test, baseline_dict)
        accuracy_list.append(new_acc)
        diff_from_best.append(new_acc - gbm_dict[pitcher]['accuracy'])

    print 'Acc. list:', accuracy_list
    print 'Acc. Mean:', pd.Series(accuracy_list).mean()
    print 'Acc. SD:', pd.Series(accuracy_list).std()
    print 'Acc. diff list:', diff_from_best
    print 'Acc. diff Mean:', pd.Series(diff_from_best).mean()
    print 'Acc. diff SD:', pd.Series(diff_from_best).std()
    
    gbm_accuracy_dict[pitcher]['acc_list'] = accuracy_list
    gbm_accuracy_dict[pitcher]['acc_mean'] = pd.Series(accuracy_list).mean()
    gbm_accuracy_dict[pitcher]['acc_std'] = pd.Series(accuracy_list).std()
    gbm_accuracy_dict[pitcher]['acc_diff_list'] = diff_from_best
    gbm_accuracy_dict[pitcher]['acc_diff_mean'] = pd.Series(diff_from_best).mean()
    gbm_accuracy_dict[pitcher]['acc_diff_sd'] = pd.Series(diff_from_best).std()

In [6]:
log_dict = {450729: {'accuracy': 0.80259447264523409,
  'baseline_accuracy': 0.702,
  'params': {'C': 0.1, 'penalty': 'l1', 'random_state': 35}},
 451661: {'accuracy': 0.71627906976744182,
  'baseline_accuracy': 0.701,
  'params': {'C': 0.1, 'penalty': 'l1', 'random_state': 35}},
 493159: {'accuracy': 0.62135922330097082,
  'baseline_accuracy': 0.585,
  'params': {'C': 0.1, 'penalty': 'l1', 'random_state': 35}},
 501822: {'accuracy': 0.78000000000000003,
  'baseline_accuracy': 0.649,
  'params': {'C': 0.1, 'penalty': 'l1', 'random_state': 35}},
 523989: {'accuracy': 0.63148148148148153,
  'baseline_accuracy': 0.646,
  'params': {'C': 0.1, 'penalty': 'l1', 'random_state': 35}},
 527048: {'accuracy': 0.596045197740113,
  'baseline_accuracy': 0.595,
  'params': {'C': 7.0, 'penalty': 'l1', 'random_state': 35}},
 533167: {'accuracy': 0.61273209549071617,
  'baseline_accuracy': 0.51,
  'params': {'C': 1.0, 'penalty': 'l2', 'random_state': 35}},
 547888: {'accuracy': 0.69009584664536738,
  'baseline_accuracy': 0.722,
  'params': {'C': 0.1, 'penalty': 'l2', 'random_state': 35}},
 573204: {'accuracy': 0.66666666666666663,
  'baseline_accuracy': 0.656,
  'params': {'C': 0.5, 'penalty': 'l1', 'random_state': 35}},
 595307: {'accuracy': 0.58582089552238803,
  'baseline_accuracy': 0.706,
  'params': {'C': 1.0, 'penalty': 'l2', 'random_state': 35}}}

In [None]:
log_accuracy_dict = {}
for pitcher in pitcher_list:
    
    print 'starting pitcher', pitcher, '\n'
    
    log_accuracy_dict[pitcher] = {}
    
    try:
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)
    except:
        # Establish a connection to the redshift database
        conn = create_rs_conn(config=REDSHIFT_CONFIG)
        cur = conn.cursor()

        #Retry on the pitchers
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)

    #Sort the dataframe and create a 90-10 split by date
    subset_date = str(pitcher_df['date'].quantile(.9))[:10]
    modeling_data = split_test_train(pitcher_df, subset_date)

    #Subset down to the columns of interest
    baseline_dict = subset_data(modeling_data, cols_of_interest)

    accuracy_list = []
    diff_from_best = []
    print 'naive_accuracy:', log_dict[pitcher]['baseline_accuracy']
    print 'best_accuracy:', log_dict[pitcher]['accuracy']
    log_accuracy_dict[pitcher]['best_acc'] = log_dict[pitcher]['baseline_accuracy']
    log_accuracy_dict[pitcher]['naive_acc'] = log_dict[pitcher]['accuracy']
    
    for new_pitcher in pitcher_list:
        rf_test = LogisticRegression(C= log_dict[new_pitcher]['params']['C'], 
                                         penalty=log_dict[new_pitcher]['params']['penalty'],
                                         random_state = 35)
        new_acc = run_classifier(rf_test, baseline_dict)
        accuracy_list.append(new_acc)
        diff_from_best.append(new_acc - log_dict[pitcher]['accuracy'])

    print 'Acc. list:', accuracy_list
    print 'Acc. Mean:', pd.Series(accuracy_list).mean()
    print 'Acc. SD:', pd.Series(accuracy_list).std()
    print 'Acc. diff list:', diff_from_best
    print 'Acc. diff Mean:', pd.Series(diff_from_best).mean()
    print 'Acc. diff SD:', pd.Series(diff_from_best).std()
    
    log_accuracy_dict[pitcher]['acc_list'] = accuracy_list
    log_accuracy_dict[pitcher]['acc_mean'] = pd.Series(accuracy_list).mean()
    log_accuracy_dict[pitcher]['acc_std'] = pd.Series(accuracy_list).std()
    log_accuracy_dict[pitcher]['acc_diff_list'] = diff_from_best
    log_accuracy_dict[pitcher]['acc_diff_mean'] = pd.Series(diff_from_best).mean()
    log_accuracy_dict[pitcher]['acc_diff_sd'] = pd.Series(diff_from_best).std()

In [1]:
lin_svc_dict = {450729: {'accuracy': 0.80428652002256062,
  'baseline_accuracy': 0.702,
  'params': {'C': 1.0, 'dual': False, 'penalty': 'l2', 'random_state': 35}},
 451661: {'accuracy': 0.71627906976744182,
  'baseline_accuracy': 0.701,
  'params': {'C': 0.1, 'dual': False, 'penalty': 'l2', 'random_state': 35}},
 493159: {'accuracy': 0.64563106796116509,
  'baseline_accuracy': 0.585,
  'params': {'C': 0.1, 'dual': False, 'penalty': 'l1', 'random_state': 35}},
 501822: {'accuracy': 0.78500000000000003,
  'baseline_accuracy': 0.649,
  'params': {'C': 1.0, 'dual': False, 'penalty': 'l1', 'random_state': 35}},
 523989: {'accuracy': 0.62592592592592589,
  'baseline_accuracy': 0.646,
  'params': {'C': 0.1, 'dual': False, 'penalty': 'l1', 'random_state': 35}},
 527048: {'accuracy': 0.58757062146892658,
  'baseline_accuracy': 0.595,
  'params': {'C': 5.0, 'dual': False, 'penalty': 'l2', 'random_state': 35}},
 533167: {'accuracy': 0.60742705570291777,
  'baseline_accuracy': 0.51,
  'params': {'C': 0.1, 'dual': False, 'penalty': 'l1', 'random_state': 35}},
 547888: {'accuracy': 0.68690095846645371,
  'baseline_accuracy': 0.722,
  'params': {'C': 0.1, 'dual': False, 'penalty': 'l1', 'random_state': 35}},
 573204: {'accuracy': 0.66666666666666663,
  'baseline_accuracy': 0.656,
  'params': {'C': 2.0, 'dual': False, 'penalty': 'l1', 'random_state': 35}},
 595307: {'accuracy': 0.58208955223880599,
  'baseline_accuracy': 0.706,
  'params': {'C': 0.1, 'dual': False, 'penalty': 'l2', 'random_state': 35}}}

In [None]:
lin_svc_accuracy_dict = {}
for pitcher in pitcher_list:
    
    print 'starting pitcher', pitcher, '\n'
    
    lin_svc_accuracy_dict[pitcher] = {}
    
    try:
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)
    except:
        # Establish a connection to the redshift database
        conn = create_rs_conn(config=REDSHIFT_CONFIG)
        cur = conn.cursor()

        #Retry on the pitchers
        pitcher_df = get_pitcher_df_for_modeling(cur, 
                                pitcher_id = pitcher, 
                                date_subsetting = False)

    #Sort the dataframe and create a 90-10 split by date
    subset_date = str(pitcher_df['date'].quantile(.9))[:10]
    modeling_data = split_test_train(pitcher_df, subset_date)

    #Subset down to the columns of interest
    baseline_dict = subset_data(modeling_data, cols_of_interest)

    accuracy_list = []
    diff_from_best = []
    print 'naive_accuracy:', lin_svc_dict[pitcher]['baseline_accuracy']
    print 'best_accuracy:', lin_svc_dict[pitcher]['accuracy']
    lin_svc_accuracy_dict[pitcher]['best_acc'] = lin_svc_dict[pitcher]['baseline_accuracy']
    lin_svc_accuracy_dict[pitcher]['naive_acc'] = lin_svc_dict[pitcher]['accuracy']
    
    for new_pitcher in pitcher_list:
        rf_test = LinearSVC(C= lin_svc_dict[new_pitcher]['params']['C'], 
                                         penalty=lin_svc_dict[new_pitcher]['params']['penalty'],
                                         dual = False,
                                         random_state = 35)
        new_acc = run_classifier(rf_test, baseline_dict)
        accuracy_list.append(new_acc)
        diff_from_best.append(new_acc - lin_svc_dict[pitcher]['accuracy'])

    print 'Acc. list:', accuracy_list
    print 'Acc. Mean:', pd.Series(accuracy_list).mean()
    print 'Acc. SD:', pd.Series(accuracy_list).std()
    print 'Acc. diff list:', diff_from_best
    print 'Acc. diff Mean:', pd.Series(diff_from_best).mean()
    print 'Acc. diff SD:', pd.Series(diff_from_best).std()
    
    lin_svc_accuracy_dict[pitcher]['acc_list'] = accuracy_list
    lin_svc_accuracy_dict[pitcher]['acc_mean'] = pd.Series(accuracy_list).mean()
    lin_svc_accuracy_dict[pitcher]['acc_std'] = pd.Series(accuracy_list).std()
    lin_svc_accuracy_dict[pitcher]['acc_diff_list'] = diff_from_best
    lin_svc_accuracy_dict[pitcher]['acc_diff_mean'] = pd.Series(diff_from_best).mean()
    lin_svc_accuracy_dict[pitcher]['acc_diff_sd'] = pd.Series(diff_from_best).std()

In [53]:
def run_randomsearch_classifier(classifier, data_dict):
    """Fit a search-style classifier, report on it, and return its test predictions.

    classifier must expose fit, predict and best_params_ (e.g. a
    RandomizedSearchCV instance). data_dict supplies 'train_data',
    'train_targets', 'test_data' and 'test_targets'.

    Prints the best parameters found and the accuracy on the test data, then
    returns the predictions made for data_dict['test_data'].
    """
    train_features = data_dict['train_data']
    train_labels = data_dict['train_targets']

    # Train on the full training split
    classifier.fit(train_features, train_labels)

    # Show the winning parameter combination
    print(classifier.best_params_)

    # Predict the held-out rows and show how many were right
    dev_predictions = classifier.predict(data_dict['test_data'])
    print(accuracy_score(data_dict['test_targets'], dev_predictions))

    return dev_predictions

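Conclusion: per-pitcher hyper-parameter tuning makes little difference to test accuracy, so we can use a common set of hyper-parameters for these classifiers. Next up: write a function that tries the different ensembles of these classifiers and chooses the best combination.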

In [15]:
# Load and split the data for one pitcher (used to try out the classifiers below)
pitcher = 595307
#Get the pitcher's data, reconnecting once if the first attempt fails
try:
    pitcher_df = get_pitcher_df_for_modeling(cur,
                            pitcher_id = pitcher,
                            date_subsetting = False)
except Exception:
    # `except Exception` (not a bare except) so Ctrl-C is not swallowed.
    # Establish a connection to the redshift database
    conn = create_rs_conn(config=REDSHIFT_CONFIG)
    cur = conn.cursor()

    #Retry on the pitchers
    pitcher_df = get_pitcher_df_for_modeling(cur,
                            pitcher_id = pitcher,
                            date_subsetting = False)

#Split train/test at the 90th percentile of the pitcher's dates
subset_date = str(pitcher_df['date'].quantile(.9))[:10]
modeling_data = split_test_train(pitcher_df, subset_date)

#Subset down to the columns of interest
baseline_dict = subset_data(modeling_data, cols_of_interest)

In [16]:
def run_all_classifiers(data_dict):
    '''Fits four classifiers, each with a fixed set of hyper-parameters, on the training data:
    - Random Forest ('rf')
    - Gradient Boosted Machine ('gbm')
    - Logistic Regression ('log_reg')
    - Linear Support Vector Machine ('lin_svc')
    Input:
        data_dict: data dictionary; 'train_data' and 'train_targets' are read
    Output:
        dictionary mapping each label above to the result of calling fit on that classifier'''

    features = data_dict['train_data']
    targets = data_dict['train_targets']

    fitted_models = {}

    # Random forest
    forest = RandomForestClassifier(n_estimators = 350,
                                    max_depth = 3,
                                    min_samples_split = 6,
                                    min_samples_leaf = 7)
    fitted_models['rf'] = forest.fit(features, targets)

    # Gradient boosted machine
    boosted = GradientBoostingClassifier(loss = 'deviance',
                                         max_depth = 3,
                                         max_features = 'auto')
    fitted_models['gbm'] = boosted.fit(features, targets)

    # L1-penalised logistic regression
    logistic = LogisticRegression(penalty = 'l1', C = 0.1)
    fitted_models['log_reg'] = logistic.fit(features, targets)

    # L1-penalised linear SVM
    linear_svm = LinearSVC(penalty = 'l1', C = 0.1, dual = False)
    fitted_models['lin_svc'] = linear_svm.fit(features, targets)

    return fitted_models

In [21]:
# Baseline accuracy from naive_accuracy2 for this pitcher's modeling data
naive_accuracy2(baseline_dict)

0.706

In [22]:
# Baseline accuracy from naive_accuracy (defined elsewhere) on the same data, for comparison
naive_accuracy(baseline_dict)

0.642

In [17]:
# Fit the four classifiers (rf, gbm, log_reg, lin_svc) on this pitcher's training data
class_dict = run_all_classifiers(baseline_dict)

In [25]:
# Test-set accuracy of the random forest on its own
accuracy_score(baseline_dict['test_targets'], class_dict['rf'].predict(baseline_dict['test_data']))

0.64552238805970152

In [26]:
def collect_classifier_predictions2(data_dict, classifier_dict):
    """Gather the test-data predictions of every classifier in classifier_dict.
    The classifiers are used as given; nothing is fitted here.
    Input:
        data_dict: data dictionary; only 'test_data' is read, and it is passed to each classifier's predict
        classifier_dict: dictionary mapping a label to a trained classifier
    Output:
        dictionary with the same labels as keys, each value being what that classifier's predict returned"""

    return dict(
        (label, model.predict(data_dict['test_data']))
        for label, model in classifier_dict.items()
    )

In [27]:
# Collect each fitted classifier's predictions on the test data
prediction_dict = collect_classifier_predictions2(baseline_dict, class_dict)

In [50]:
from itertools import combinations
def choose_best_ensemble(pred_dict, modeling_dict):
    '''Finds the single classifier or voting ensemble with the highest accuracy on the test targets.
    Every classifier is scored on its own, then every combination of three or more
    classifiers is scored after being combined by ensemble_voting (defined elsewhere).
    Ties keep the candidate that was evaluated first, so a single classifier wins
    over an ensemble with the same accuracy.
    Input:
        pred_dict: dictionary mapping a classifier label to that classifier's predictions
        modeling_dict: data dictionary; only 'test_targets' is read
    Output:
        dictionary with two keys:
            'best_acc': the highest accuracy found (0 when pred_dict is empty)
            'classifier_combination': the winning label when a single classifier is best,
                the tuple of labels when an ensemble is best, or None when pred_dict is empty'''

    targets = modeling_dict['test_targets']

    # Start with no winner. Tracking "no winner yet" explicitly means the first candidate
    # is always recorded, even if its accuracy is 0, and an empty pred_dict still returns
    # a result instead of failing on an unassigned variable.
    best_accuracy = 0
    classifier_combo = None

    # Try each of the classifiers individually
    for classifier in pred_dict:
        new_acc = accuracy_score(targets, pred_dict[classifier])

        if classifier_combo is None or new_acc > best_accuracy:
            best_accuracy = new_acc
            classifier_combo = classifier

    # Using at least three classifiers, try all different modeling combinations
    for combo_size in range(3, len(pred_dict) + 1):

        for combo in combinations(pred_dict, combo_size):

            # Restrict the predictions to the classifiers in the current combo
            combo_preds = dict((label, pred_dict[label]) for label in combo)

            # Ensemble vote
            new_preds = ensemble_voting(combo_preds)

            # Get accuracy and compare to current best
            new_acc = accuracy_score(targets, new_preds)
            if new_acc > best_accuracy:
                best_accuracy = new_acc
                classifier_combo = combo

    return {'best_acc' : best_accuracy,
            'classifier_combination' : classifier_combo}

In [51]:
# Find the best-scoring single classifier or ensemble for this pitcher
choose_best_ensemble(prediction_dict, baseline_dict)

{'best_acc': 0.64552238805970152, 'classifier_combination': 'rf'}

In [39]:
# Scratch check: parentheses without a trailing comma do not build a tuple,
# so test is a plain str here
test = ('hello')
type(test)

str

In [36]:
import itertools
# Scratch check: combinations() is called here without its second argument (the
# subset size r), which is what raises the TypeError in this cell's output
[x for x in itertools.combinations([1,2,3])]

TypeError: Required argument 'r' (pos 2) not found

In [19]:
def naive_accuracy2(data_dict):
    '''Share of the training targets that belong to the most frequent class, rounded to 3 decimals.
    Input:
        data_dict: data dictionary; only 'train_targets' (a pandas Series) is read
    Output:
        count of the most common target value divided by the count of all targets that
        value_counts() reports (null targets are left out by its default settings)'''

    target_counts = data_dict['train_targets'].value_counts()

    # value_counts() sorts by descending count, so the first row is the largest class.
    # Use .iloc for a positional lookup: a plain [0] is treated as the *label* 0 when the
    # targets are integers, which gives the count of class 0 (or a KeyError) instead.
    biggest_count = target_counts.iloc[0]
    all_counts = target_counts.sum()
    return round(float(biggest_count) / all_counts, 3)