In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('cleaned_baseball.csv')

In [None]:
# Vertical Apporach Angle (degrees)
def calculate_vaa_physics(row):
    vy0 = row['vy0']  # y-velocity at 50ft (ft/s)
    ay = row['ay']    # y-acceleration (ft/s²)
    vz0 = row['vz0']  # z-velocity at 50ft (ft/s)
    az = row['az']    # z-acceleration (ft/s²)
    
    # Distance from 50ft mark to home plate (50ft)
    y0_minus_yf = 50
    
    vy_f = -np.sqrt(vy0**2 - (2 * ay * y0_minus_yf))
    t = (vy_f - vy0) / ay
    vz_f = vz0 + (az * t)
    vaa = -np.degrees(np.arctan(vz_f / vy_f))
    return round(vaa, 2)

df['VAA'] = df.apply(calculate_vaa_physics, axis=1)

# Horizontal Approach Angle (degrees)
# HAA = arctan(horizontal movement / distance from 50ft to home)
df['HAA'] = np.degrees(np.arctan(df['pfx_x'] / 50)).round(2)


# If spin_axis is stored in degrees
df['spin_axis_rad'] = np.radians(df['spin_axis'])  # Correct conversion

# Spin Efficiency (estimated)
# Spin axis adjustment: 0° = topspin, 180° = backspin, 90° = sidespin
df['spin_efficiency'] = np.abs(np.sin(df['spin_axis_rad'])).round(2)

In [5]:
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,batter_name,VAA,HAA,spin_axis_rad,spin_efficiency
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,,1.23,1.44,-1.44,,"Munguia, Ismael",-5.36,-1.65,4.101524,0.82
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,,1.07,1.35,-1.35,,"Munguia, Ismael",-4.64,-1.55,4.118977,0.83
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,,1.25,1.02,1.02,,"Peraza, Oswald",-4.9,-1.17,4.049164,0.79
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,,1.22,0.82,0.82,,"Peraza, Oswald",-3.55,-0.94,4.08407,0.81
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,,0.99,1.11,1.11,,"Peraza, Oswald",-4.03,-1.27,4.049164,0.79


In [6]:
# Calculate overall averages across all years
league_avg_pfx_z = df.groupby('pitch_type')['pfx_z'].transform('mean')
league_avg_pfx_x = df.groupby('pitch_type')['pfx_x'].transform('mean')

# Break+
df['Vertical_Break'] = (df['pfx_z'] / league_avg_pfx_z) * 100
df['Horizontal_Break'] = (df['pfx_x'] / league_avg_pfx_x) * 100


df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,api_break_x_arm,api_break_x_batter_in,arm_angle,batter_name,VAA,HAA,spin_axis_rad,spin_efficiency,Vertical_Break,Horizontal_Break
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,1.44,-1.44,,"Munguia, Ismael",-5.36,-1.65,4.101524,0.82,173.51781,278.507995
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,1.35,-1.35,,"Munguia, Ismael",-4.64,-1.55,4.118977,0.83,196.150568,261.101245
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,1.02,1.02,,"Peraza, Oswald",-4.9,-1.17,4.049164,0.79,150.885052,197.276496
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,0.82,0.82,,"Peraza, Oswald",-3.55,-0.94,4.08407,0.81,161.447006,158.59483
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,1.11,1.11,,"Peraza, Oswald",-4.03,-1.27,4.049164,0.79,206.712522,214.683246


In [7]:
# Define swing-and-miss (whiff) events
whiff_events = ['swinging_strike', 'swinging_strike_blocked', 'missed_bunt']
df['is_whiff'] = df['description'].isin(whiff_events).astype(int)

# Define strikes (including fouls)
strike_events = ['called_strike', 'swinging_strike', 'foul', 'foul_tip']
df['is_strike'] = df['description'].isin(strike_events).astype(int)

df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,arm_angle,batter_name,VAA,HAA,spin_axis_rad,spin_efficiency,Vertical_Break,Horizontal_Break,is_whiff,is_strike
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,,"Munguia, Ismael",-5.36,-1.65,4.101524,0.82,173.51781,278.507995,0,0
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,,"Munguia, Ismael",-4.64,-1.55,4.118977,0.83,196.150568,261.101245,0,1
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,,"Peraza, Oswald",-4.9,-1.17,4.049164,0.79,150.885052,197.276496,0,0
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,,"Peraza, Oswald",-3.55,-0.94,4.08407,0.81,161.447006,158.59483,1,1
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,,"Peraza, Oswald",-4.03,-1.27,4.049164,0.79,206.712522,214.683246,1,1


In [8]:
def categorize_zone(row):
    if row['plate_z'] < 2.5:
        vertical = 'low'
    else:
        vertical = 'high'
    
    if row['plate_x'] < -0.5:
        horizontal = 'away'
    elif row['plate_x'] > 0.5:
        horizontal = 'inside'
    else:
        horizontal = 'middle'
    
    return f"{vertical}_{horizontal}"

df['zone_category'] = df.apply(categorize_zone, axis=1)

df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,batter_name,VAA,HAA,spin_axis_rad,spin_efficiency,Vertical_Break,Horizontal_Break,is_whiff,is_strike,zone_category
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,"Munguia, Ismael",-5.36,-1.65,4.101524,0.82,173.51781,278.507995,0,0,low_away
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,"Munguia, Ismael",-4.64,-1.55,4.118977,0.83,196.150568,261.101245,0,1,low_middle
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,"Peraza, Oswald",-4.9,-1.17,4.049164,0.79,150.885052,197.276496,0,0,low_middle
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,"Peraza, Oswald",-3.55,-0.94,4.08407,0.81,161.447006,158.59483,1,1,high_middle
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,"Peraza, Oswald",-4.03,-1.27,4.049164,0.79,206.712522,214.683246,1,1,high_away


In [9]:
# Pitcher's cumulative whiff rate (up to current game)
df['pitcher_whiff_rate'] = df.groupby(['pitcher', 'pitch_type'])['is_whiff'] \
    .transform(lambda x: x.expanding().mean().shift(1))

# Batter's weakness against pitch types (up to current game)
df['batter_whiff_rate'] = df.groupby(['batter', 'pitch_type'])['is_whiff'] \
    .transform(lambda x: x.expanding().mean().shift(1))

# League-average whiff rates
league_whiff_rates = df.groupby('pitch_type')['is_whiff'].mean().to_dict()
df['league_whiff_rate'] = df['pitch_type'].map(league_whiff_rates)

df['pitcher_whiff_rate'] = df['pitcher_whiff_rate'].fillna(df['league_whiff_rate'])
df['batter_whiff_rate'] = df['batter_whiff_rate'].fillna(df['league_whiff_rate'])

df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,spin_axis_rad,spin_efficiency,Vertical_Break,Horizontal_Break,is_whiff,is_strike,zone_category,pitcher_whiff_rate,batter_whiff_rate,league_whiff_rate
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,4.101524,0.82,173.51781,278.507995,0,0,low_away,0.057974,0.057974,0.057974
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,4.118977,0.83,196.150568,261.101245,0,1,low_middle,0.0,0.0,0.057974
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,4.049164,0.79,150.885052,197.276496,0,0,low_middle,0.0,0.057974,0.057974
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,4.08407,0.81,161.447006,158.59483,1,1,high_middle,0.0,0.0,0.057974
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,4.049164,0.79,206.712522,214.683246,1,1,high_away,0.25,0.5,0.057974


In [12]:
df_new = df[[
    'game_date', 'game_type', 'pitcher', 'pitcher_name',
    'batter', 'batter_name',
    'pitch_type', 'pitch_name', 'events', 'description', 'stand', 'p_throws',
    'release_speed', 'release_spin_rate', 'effective_speed', 
    'release_pos_x', 'release_pos_y', 'release_pos_z', 
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 
    'pfx_x', 'pfx_z',
    'plate_x', 'plate_z', 'zone', 
    'type', 'hit_location', 'bb_type', 'balls', 'strikes', 
    'hc_x', 'hc_y', 'launch_speed', 'launch_angle',
    'launch_speed_angle', 'arm_angle', 'VAA', 'HAA',
    'spin_axis', 'spin_efficiency',
    'Vertical_Break', 'Horizontal_Break',
    'is_whiff', 'is_strike', 'zone_category', 
    'pitcher_whiff_rate', 'batter_whiff_rate', 'league_whiff_rate'
]].copy()

In [13]:
df_new.head()

Unnamed: 0,game_date,game_type,pitcher,pitcher_name,batter,batter_name,pitch_type,pitch_name,events,description,...,spin_axis,spin_efficiency,Vertical_Break,Horizontal_Break,is_whiff,is_strike,zone_category,pitcher_whiff_rate,batter_whiff_rate,league_whiff_rate
0,2025-03-25,S,801619,"Faherty, Jake",665998,"Munguia, Ismael",SI,Sinker,field_out,hit_into_play,...,235.0,0.82,173.51781,278.507995,0,0,low_away,0.057974,0.057974,0.057974
1,2025-03-25,S,801619,"Faherty, Jake",665998,"Munguia, Ismael",SI,Sinker,,called_strike,...,236.0,0.83,196.150568,261.101245,0,1,low_middle,0.0,0.0,0.057974
2,2025-03-25,S,801619,"Faherty, Jake",672724,"Peraza, Oswald",SI,Sinker,field_out,hit_into_play,...,232.0,0.79,150.885052,197.276496,0,0,low_middle,0.0,0.057974,0.057974
3,2025-03-25,S,801619,"Faherty, Jake",672724,"Peraza, Oswald",SI,Sinker,,swinging_strike,...,234.0,0.81,161.447006,158.59483,1,1,high_middle,0.0,0.0,0.057974
4,2025-03-25,S,801619,"Faherty, Jake",672724,"Peraza, Oswald",SI,Sinker,,swinging_strike,...,232.0,0.79,206.712522,214.683246,1,1,high_away,0.25,0.5,0.057974


In [14]:
df_new.to_csv('final_baseball.csv', index=False)