In [1]:
import psycopg2
import sys  

import numpy as np
import pandas as pd

from sklearn import cross_validation

from config import REDSHIFT_CONFIG
from src.features import *
from src.utils import *
from src.validation import *

import sys
# reload(sys)
# sys.setdefaultencoding('utf8')

In [2]:
# Establish a connection to the redshift database.
# `create_rs_conn` and `REDSHIFT_CONFIG` come from the project imports in the
# first cell; `conn` and `cur` are notebook-global and are reused by the
# cells below.
# NOTE(review): neither the cursor nor the connection is closed anywhere in
# the visible cells -- confirm whether cleanup happens elsewhere.
conn = create_rs_conn(config=REDSHIFT_CONFIG)
cur = conn.cursor()

In [3]:
# Pull every recorded pitch for one pitcher. The id is hard-coded;
# presumably it is Justin Verlander, given the CSV name used at the end of
# the notebook -- TODO confirm.
# Adjacent string literals are concatenated into a single one-line query.
raw_query = (
    "SELECT * "
    "FROM all_pitch_data "
    "WHERE pitcher = '434378'"
)

# run_rs_query hands back the column names and the result rows separately;
# the rows become the frame and the names are attached as its column labels.
sample_header, sample_rows = run_rs_query(cur, raw_query)
sample_df = pd.DataFrame(sample_rows)
sample_df.columns = sample_header
# sample_df.head()

In [32]:
# Send an explicit `rollback;` statement through the shared cursor.
# NOTE(review): presumably used to clear an aborted transaction after a failed
# query -- confirm. This cell carries execution count 32, so it was run out of
# order and is not needed for a clean top-to-bottom run.
cur.execute("""rollback;""")

In [7]:
def pitcher_batter_priors(df, beta=4.0):
    """Add per-batter pitch-type prior features for one pitcher's pitches.

    The pitches are processed in the order given by sorting on ``game_id``,
    ``num`` and ``id``. For every row, and for every distinct value found in
    the ``pitch_type`` column, two features are computed from the rows that
    come *before* it (the current pitch is only counted afterwards):

    * ``<type>_prior`` -- share of the pitches previously thrown to this
      batter that were of ``<type>``; 0.0 when the batter has not been seen.
    * ``<type>_shrunk_prior`` -- that share shrunk towards the share of
      ``<type>`` across all batters so far::

          (n * s + beta * p) / (n + beta)

      with ``n`` the number of pitches previously thrown to this batter,
      ``s`` the batter-specific share and ``p`` the all-batter share. For a
      batter not seen before this is simply ``p``; on the very first row
      (no history at all) it is 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``game_id``, ``num``, ``id``, ``batter``
        and ``pitch_type``. It is not modified.
    beta : float, optional
        Shrinkage strength: the number of pseudo-observations given to the
        all-batter share. Defaults to 4.0.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df`` with one ``<type>_prior`` and one
        ``<type>_shrunk_prior`` column added per distinct pitch type.

    Notes
    -----
    A one-line summary (frame shape and the length of the generated prior
    lists) is printed as a sanity check before returning.
    """
    sort_keys = ['game_id', 'num', 'id']

    # Distinct pitch types; each one yields a pair of output columns.
    pitch_list = df['pitch_type'].unique()

    # Output columns, built row by row.
    pb_priors = dict((pitch, []) for pitch in pitch_list)
    pb_priors_shrunk = dict((pitch, []) for pitch in pitch_list)

    # Running counts. Totals are tracked separately from the per-type counts
    # so that no bookkeeping key can collide with a pitch-type value.
    batter_counts = {}   # batter -> {pitch type -> count}
    batter_totals = {}   # batter -> number of pitches seen
    global_counts = dict((pitch, 0) for pitch in pitch_list)
    global_total = 0

    # DataFrame.sort was replaced by sort_values and later removed from
    # pandas; prefer the new name and fall back only on old installations.
    if hasattr(df, 'sort_values'):
        df = df.sort_values(sort_keys)
    else:
        df = df.sort(sort_keys)

    # Only two columns are needed per row, so iterate them directly rather
    # than materialising a Series per row with iterrows().
    for cur_batter, pitch_type in zip(df['batter'], df['pitch_type']):
        counts = batter_counts.setdefault(cur_batter, {})
        batter_total = batter_totals.get(cur_batter, 0)

        ### Write priors to output lists ###
        for ptype in pitch_list:
            if global_total:
                global_rate = global_counts[ptype] / float(global_total)
            else:
                # Very first row: there is no history to draw on.
                global_rate = 0.0

            if batter_total == 0:
                # Batter never seen: no batter-specific information, so the
                # shrunk estimate is the all-batter share.
                prior = 0.0
                shrunk = global_rate
            else:
                type_count = counts.get(ptype, 0)
                prior = type_count / float(batter_total)
                # (n * s + beta * p) / (n + beta) with n = batter_total and
                # n * s = type_count. Using the batter's total as n (not
                # the per-type count) makes the shrunk priors sum to 1 and
                # lets types never thrown to this batter decay towards 0.
                shrunk = ((type_count + beta * global_rate)
                          / (batter_total + float(beta)))

            pb_priors[ptype].append(prior)
            pb_priors_shrunk[ptype].append(shrunk)

        ### Update the stores with the current pitch's type ###
        counts[pitch_type] = counts.get(pitch_type, 0) + 1
        batter_totals[cur_batter] = batter_total + 1
        global_counts[pitch_type] += 1
        global_total += 1

    # Add columns to the (sorted copy of the) dataframe
    for ptype in pitch_list:
        df[str(ptype) + "_prior"] = pb_priors[ptype]
        df[str(ptype) + "_shrunk_prior"] = pb_priors_shrunk[ptype]

    # Sanity check: every prior list should have one entry per row. Any
    # available list is used instead of assuming an 'FF' pitch type exists.
    n_prior = len(next(iter(pb_priors.values()), []))
    n_shrunk = len(next(iter(pb_priors_shrunk.values()), []))
    print("%s %d %d" % (str(df.shape), n_prior, n_shrunk))
    return df

In [44]:
# List the distinct pitch_type values in the sample. Each distinct value
# (including a missing one) becomes its own pair of prior columns in
# pitcher_batter_priors.
sample_df['pitch_type'].unique()

array(['FT', 'CH', 'FF', 'SL', 'CU', 'IN', None, 'PO'], dtype=object)

In [9]:
# Compute the prior features for the sampled pitcher and write the augmented
# frame to a CSV in the current working directory. The function also prints
# a one-line shape/length sanity check.
pitcher_batter_priors(sample_df).to_csv("verlander_priors.csv")

(23143, 91) 23143 23143
