# Calculate Run Value from Pitch by Pitch Data

## Purpose:
### Prepare model to apply to synergy data to create similar matrix to the college baseball run environment, more specifically the CAA.

## Deriving run values from pitch by pitch data from the 2022 regular season:
### 1. Create a column containing the count-out-base state for each pitch.
### 2. Split the data into individual half-innings.
### 3. For each half-inning, create a new column that holds the difference in runs scored by the end of the half-inning compared to the beginning.
### 4. Iterate through each pitch, adding the value in the 'runs_by_end' column to the existing run value for that state and increasing the count of times that state has occurred in a dictionary of states and run values. If the state is not already in the dictionary, add it with the value of 'runs_by_end' and a count of 1.
### 5. For each state in the dictionary, divide the total run value by the number of times that state occurred to calculate the average runs scored by the end of the half-inning when that state is present.
### 6. Create a new dictionary with the states as keys and the average runs scored as values.

In [1]:
import pybaseball as pyb
import pandas as pd

In [2]:
def create_half_inning_df(group):
    # Create a new dataframe for the half-inning
    half_inning_df = pd.DataFrame(group)
        
    return half_inning_df

In [3]:
def map_state_column(df, run_expectancy_dict):
    df['run_expectancy'] = df['state'].map(run_expectancy_dict)
    return df

In [4]:
def calc_delta_run_expectancy(df):
    # Shift the values in the 'run_expectancy' column down by one row
    df['run_expectancy_shifted'] = df['run_expectancy'].shift(-1)
    # Create a new column with the difference between the shifted 'run_expectancy' and the current 'run_expectancy'
    df['delta_run_expectancy'] = df['run_expectancy_shifted'] - df['run_expectancy']
    # For the last row in the dataframe, set the 'delta_run_expectancy' to 0 - 'run_expectancy'
    df.loc[df.index[-1], 'delta_run_expectancy'] = 0 - df['run_expectancy'].iloc[-1]
    return df

In [5]:
reg_season = pyb.statcast(start_dt = '2021-04-01', end_dt = '2021-10-03')

# drop pre-made run expectancy
whole_season_df.drop(columns=['delta_run_expec'])

# reverse order so dataframe is in chronological order
reg_season = reg_season.iloc[::-1]

# drop extra inning games and bottom of the 9th games
reg_season = reg_season.loc[reg_season['inning'] <= 9]
reg_season = reg_season.drop(reg_season[(reg_season['inning_topbot'] == 'Bottom') & 
                                        (reg_season['inning'] == 9)].index)

# curently playerID or NaN, replace with 1 and 0
for col in ['on_1b', 'on_2b', 'on_3b']:
    # Replace NA values with 0
    reg_season[col] = reg_season[col].fillna(0)
    # Replace all other values with 1
    reg_season[col] = reg_season[col].where(reg_season[col] == 0, 1)
    
# make a state column with the strike ball out base state
reg_season['state'] = (
    reg_season['strikes'].astype(str) + 
    reg_season['balls'].astype(str) + 
    reg_season['outs_when_up'].astype(str) + 
    reg_season['on_1b'].astype(int).astype(str) + 
    reg_season['on_2b'].astype(int).astype(str) + 
    reg_season['on_3b'].astype(int).astype(str)
)

This is a large query, it may take a moment to complete


100%|█████████████████████████████████████████| 186/186 [02:47<00:00,  1.11it/s]


In [6]:
# Group the data by game, inning, and top/bottom of inning
half_innings = reg_season.groupby(['game_pk', 'inning', 'inning_topbot'])

# Create an empty list to store the half-inning dataframes
half_inning_dfs = []

# Iterate over the groups in the half_innings object
for name, group in half_innings:
    # Apply the create_half_inning_df function to the group
    half_inning_df = create_half_inning_df(group)
    # Append the resulting dataframe to the half_inning_dfs list
    half_inning_dfs.append(half_inning_df)

In [7]:
for df in half_inning_dfs:
    # Select the last row of the dataframe
    last_row = df.iloc[-1]

    # Get the value of the 'column_name' column in the last row
    last_row_value = last_row['post_bat_score']

    # Assign the value of 'column_name' in the last row to the entire 'column_name' column
    df['runs_by_end'] = last_row_value - df['bat_score']

In [8]:
# itterate over each pitch and record the runs by end for each state and ammount of times the state occured
for df in half_inning_dfs:
    for index, row in df.iterrows():
        if row['state'] in globals():
            globals().get(row['state'])[0] += row['runs_by_end']
            globals().get(row['state'])[1] += 1
        else:
            globals()[row['state']] = [row['runs_by_end'], 1]


In [9]:
states = set(reg_season['state'])

for state in states:
    globals()[state] = globals()[state][0]/globals()[state][1]

In [10]:
list_of_states_and_run_expectancy = []
for state in states:
    list_of_states_and_run_expectancy.append((state,globals()[state]))

In [11]:
run_expectancy_dict = dict(list_of_states_and_run_expectancy)

In [12]:
reg_season['state'] = reg_season['state'].map(run_expectancy_dict)

In [13]:
# Use the map() function to apply the map_state_column() function to each dataframe in the list
half_inning_dfs = list(map(lambda x: map_state_column(x, run_expectancy_dict), half_inning_dfs))


In [14]:
# Use the map() function to apply the calc_delta_run_expectancy() function to each dataframe in the list
half_inning_dfs = list(map(calc_delta_run_expectancy, half_inning_dfs))


In [15]:
# Concatenate the dataframes in the list into a single dataframe
whole_season_df = pd.concat(half_inning_dfs)


In [42]:
# create runs_scored column for the total runs scored after an event
whole_season_df['runs_scored'] = whole_season_df['post_bat_score'] - whole_season_df['bat_score']

# fill empty events rows with the description
whole_season_df['events'] = whole_season_df['events'].fillna(whole_season_df['description'])

# add runs scored to delta run expectancy
whole_season_df['delta_run_expectancy'] += whole_season_df['runs_scored']

In [17]:
# print average run expectancy for each event
print(whole_season_df[['events','delta_run_expectancy']].groupby('events').mean())

                            delta_run_expectancy
events                                          
ball                                    0.052429
blocked_ball                            0.062863
bunt_foul_tip                          -0.063155
called_strike                          -0.049668
catcher_interf                          0.401967
caught_stealing_2b                     -0.222726
caught_stealing_3b                     -0.387234
caught_stealing_home                   -0.402878
double                                  0.777701
double_play                            -0.925319
field_error                             0.447144
field_out                              -0.239495
fielders_choice                         0.677762
fielders_choice_out                    -0.643154
force_out                               -0.33077
foul                                   -0.037426
foul_bunt                              -0.061347
foul_pitchout                          -0.054228
foul_tip            

# Using the calculated Run Value, create xRun Value with XGBoost

In [18]:
from sklearn import metrics
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

In [19]:
def xgboost(data, features, y_value, optimal_params = None):
    
    # define x and y data
    x = data[features]
    y = data[y_value]
    
    if optimal_params == None:

        # seperate test and training sets
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=6)

        # create list to keep scores
        score_list = []
        optimal_params = {}

        # grid search to find optimal hyperparameters
        for max_depth in range(1, 25):
            for learning_rate in [0.001, 0.01, 0.1, 1]:
                for n_estimators in [100, 250, 500, 1000]:
                    model = XGBRegressor(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators)
                    model.fit(x_train, y_train)
                    y_pred_test = model.predict(x_test)
                    score = model.score(x_test, y_test)
                    score_list.append(score)
                    params = (max_depth, learning_rate, n_estimators)
                    optimal_params[score] = params
                    print(optimal_params, score)

        optimal_score = max(score_list)
        optimal_params = optimal_params[optimal_score]
    
    # intialize regressor model with optimal hyperparameters and fit data
    model = XGBRegressor(max_depth=optimal_params[0], learning_rate=optimal_params[1], n_estimators=optimal_params[2])
    model.fit(x, y)

    # return the model
    return model, optimal_params


In [20]:
ff_df = whole_season_df.loc[whole_season_df['pitch_type'] == 'FF']

In [21]:
features = ['release_speed', 'release_pos_x', 'release_pos_z', 'pfx_x', 'pfx_z', 
            'plate_x', 'plate_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'effective_speed',
            'release_spin_rate', 'release_extension', 'release_pos_y', 'spin_axis']

In [22]:
# optimal_params comes out to (3, 0.01, 1000)

model, optimal_params = xgboost(ff_df, features, 'delta_run_expectancy', optimal_params = (3, 0.01, 1000))

In [23]:
ff_df.loc[:,'xrun_value'] = model.predict(ff_df.loc[:,features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [24]:
y_true = ff_df['delta_run_expectancy'].astype(float)
y_pred = ff_df['xrun_value'].astype(float)

score = metrics.r2_score(y_true, y_pred)
score

0.025281763288097836

###### Evidently this model needs a lot of work.