In [2]:
import pandas as pd
import numpy as np
import pickle

# Yakkertech pitch-by-pitch export that this cell processes.
data_path = 'Data/splashday.csv'
df = pd.read_csv(data_path)

# Run value credited to each PlayResult outcome.
Weights = dict(
    Walk=0.695,
    HitByPitch=0.727,
    Single=0.891,
    Double=1.269,
    Triple=1.609,
    HomeRun=2.078,
)

# Look up every row's PlayResult; outcomes without a weight score 0.
df['RunValue'] = df['PlayResult'].map(Weights).fillna(0)

# Pitch types that are compared against the pitcher's average fastball.
# NOTE(review): the list contains 'Sinker' but not 'Changeup' or 'Splitter' --
# confirm this matches what the Stuff+ model was trained with.
offspeed_pitches = ['Sinker', 'Curveball', 'Slider', 'Cutter']

# Difference calculation functions
def _diff_from_fastball(event, column):
    """Return the pitcher's mean fastball ``column`` minus this pitch's ``column``.

    Shared implementation behind ``calculate_ff_diff``, ``calculate_ivb_diff``
    and ``calculate_hb_diff``. Reads the module-level ``df`` (all pitches) and
    ``offspeed_pitches`` (pitch types that get a difference).

    Parameters
    ----------
    event : row-like (pandas Series or dict)
        One pitch; must provide 'Pitcher', 'TaggedPitchType' and ``column``.
    column : str
        Name of the metric column to compare.

    Returns
    -------
    float
        ``mean(column over this pitcher's 'Fastball' rows) - event[column]``.
        NaN when the pitch's value or type is missing, when the type is not in
        ``offspeed_pitches``, or when the pitcher has no fastball average.
    """
    pitch_type = event['TaggedPitchType']
    if pd.isna(event[column]) or pd.isna(pitch_type) or pitch_type not in offspeed_pitches:
        return np.nan

    # Each call scans the whole frame for this pitcher's fastballs, so applying
    # this row-wise costs one full pass over ``df`` per qualifying pitch.
    pitcher_fastballs = (df['Pitcher'] == event['Pitcher']) & (df['TaggedPitchType'] == 'Fastball')
    fastball_avg = df.loc[pitcher_fastballs, column].mean()
    if pd.isna(fastball_avg):
        return np.nan
    return fastball_avg - event[column]


def calculate_ff_diff(event):
    """Pitcher's average fastball 'RelSpeed' minus this pitch's 'RelSpeed' (NaN if not applicable)."""
    return _diff_from_fastball(event, 'RelSpeed')


def calculate_ivb_diff(event):
    """Pitcher's average fastball 'InducedVertBreak' minus this pitch's value (NaN if not applicable)."""
    return _diff_from_fastball(event, 'InducedVertBreak')


def calculate_hb_diff(event):
    """Pitcher's average fastball 'HorzBreak' minus this pitch's value (NaN if not applicable)."""
    return _diff_from_fastball(event, 'HorzBreak')

# Add one fastball-relative difference column per metric, computed row by row.
for diff_column, diff_function in (
    ('ff_diff', calculate_ff_diff),
    ('ivb_diff', calculate_ivb_diff),
    ('hb_diff', calculate_hb_diff),
):
    df[diff_column] = df.apply(diff_function, axis=1)

# One-hot encode the tagged pitch type as float indicator columns and append
# them to the frame.
dummies = pd.get_dummies(df['TaggedPitchType'], prefix='PitchType', dtype=float)
df = pd.concat([df, dummies], axis=1)

# Pitch-type indicator columns that are part of the model feature list below.
trained_dummy_columns = [
    'PitchType_Changeup', 'PitchType_Curveball',
    'PitchType_Cutter', 'PitchType_Fastball',
    'PitchType_Knuckleball', 'PitchType_Sinker',
    'PitchType_Slider', 'PitchType_Splitter',
]

# A pitch type that never occurs in this file still needs its column (all 0s).
for col in trained_dummy_columns:
    if col not in df.columns:
        df[col] = 0.0

# Indicator columns for any pitch type outside that list are discarded.
unexpected_dummy_columns = [
    name for name in df.columns
    if name.startswith('PitchType_') and name not in trained_dummy_columns
]
df = df.drop(columns=unexpected_dummy_columns)

# Model inputs: raw pitch metrics, the three fastball-relative differences,
# then the pitch-type indicators.
numeric_features = [
    'RelSpeed', 'SpinRate', 'RelHeight', 'RelSide', 'Extension',
    'InducedVertBreak', 'HorzBreak', 'VertApprAngle', 'ZoneSpeed',
    'ff_diff', 'ivb_diff', 'hb_diff',
]
features = numeric_features + trained_dummy_columns

# Where a column label occurs more than once, keep only its first occurrence.
df = df.loc[:, ~df.columns.duplicated()]


# Load the three pickled models used by the prediction functions below.
# Unpickling can execute code stored in the file, so these paths must only
# ever point at model files from a trusted source.
def _load_pickled_model(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


stuff_plus_model = _load_pickled_model("../Stuff+/stuff_plus_model.pkl")
xba_model = _load_pickled_model("../xBA_CornBelters/xba_model.pkl")
xwoba_model = _load_pickled_model("xwoba.pkl")

# The rest of the cell refers to the hit type as 'TaggedHitType'.
df = df.rename(columns={'HitType': 'TaggedHitType'})

# Strike-zone bounds, compared against PlateLocHeight / PlateLocSide.
vert_strike_min, vert_strike_max = 1.5, 3.5
horz_strike_min, horz_strike_max = -1, 1

# Flip the sign of the horizontal plate location.
# NOTE(review): presumably switches the viewing side -- confirm. Because the
# frame is later written back to the file it was read from, re-running the
# cell on that saved file would flip the sign again.
df['PlateLocSide'] = df['PlateLocSide'].mul(-1)

# Boolean masks shared by the 0/1 flag columns below.
pitch_call = df['PitchCall']
in_play = pitch_call == 'InPlay'
swinging_strike = pitch_call == 'StrikeSwinging'
swung = pitch_call.isin(['StrikeSwinging', 'FoulBall', 'InPlay'])
height_in_zone = df['PlateLocHeight'].between(vert_strike_min, vert_strike_max)
side_in_zone = df['PlateLocSide'].between(horz_strike_min, horz_strike_max)
in_zone = height_in_zone & side_in_zone
launch_below_10 = (df['Angle'] < 10) & in_play
launch_above_25 = (df['Angle'] > 25) & in_play

# Location and pitch-call flags.
df['Vert Strike'] = height_in_zone.astype(int)
df['Horz Strike'] = side_in_zone.astype(int)
df['Strike?'] = (pitch_call == 'StrikeCalled').astype(int)
df['Foul?'] = (pitch_call == 'FoulBall').astype(int)
df['In Play?'] = in_play.astype(int)
df['Swing Strike?'] = swinging_strike.astype(int)
df['Swing?'] = swung.astype(int)
df['Ball Called?'] = (pitch_call == 'BallCalled').astype(int)
df['First Pitch'] = (df['PitchofPA'] == 1).astype(int)

# Swing / take decisions relative to the zone.
df['In Strike Zone?'] = in_zone.astype(int)
df['Chase?'] = (swung & ~in_zone).astype(int)
df['In-zone take'] = (pitch_call.isin(['StrikeCalled', 'BallCalled']) & in_zone).astype(int)
df['In-zone whiff'] = (swinging_strike & in_zone).astype(int)
df['In-zone/swing'] = (swung & in_zone).astype(int)

# Batted-ball flags: the tagged hit type, or the launch angle of a ball in play.
df['LA<10'] = launch_below_10.astype(int)
df['Ground Ball?'] = ((df['TaggedHitType'] == 'GroundBall') | launch_below_10).astype(int)
df['Fly Ball?'] = ((df['TaggedHitType'] == 'FlyBall') | launch_above_25).astype(int)

# Exit-velocity thresholds, counted only for balls in play.
for ev_label, ev_threshold in (('EV>90', 90), ('EV>100', 100), ('EV>105', 105)):
    df[ev_label] = ((df['ExitSpeed'] > ev_threshold) & in_play).astype(int)
# Define prediction functions (unchanged)
def predict_xba(event):
    """Return the expected batting average assigned to one pitch row.

    * Ball in play with both 'ExitSpeed' and 'Angle' present: the prediction
      of the module-level ``xba_model`` for ``[ExitSpeed, Angle]``.
    * 'KorBB' equal to "Walk": NaN.
    * Every other row (strikeouts, balls in play without measurements, and all
      remaining pitches): 0.

    NOTE(review): rows that do not end a plate appearance also get 0 here,
    whereas ``event_predict`` returns NaN for them -- confirm this is intended.
    """
    contact_measured = pd.notna(event['ExitSpeed']) and pd.notna(event['Angle'])
    if contact_measured and event['PitchCall'] == 'InPlay':
        batted_ball = [[event['ExitSpeed'], event['Angle']]]
        return xba_model.predict(batted_ball)[0]

    if event.get('KorBB', None) == "Walk":
        return np.nan
    return 0

def event_predict(event):
    """Return the expected wOBA value assigned to one pitch row.

    * Ball in play with both 'ExitSpeed' and 'Angle' present: the prediction
      of the module-level ``xwoba_model`` for ``[ExitSpeed, Angle]``.
    * 'KorBB' equal to "Strikeout": 0.
    * 'KorBB' equal to "Walk": 0.695.
    * Ball in play with a missing measurement: the row's own 'RunValue'.
    * Anything else: NaN.
    """
    exit_speed, launch_angle = event['ExitSpeed'], event['Angle']
    in_play = event['PitchCall'] == "InPlay"
    measured = not (pd.isna(exit_speed) or pd.isna(launch_angle))

    if measured and in_play:
        return xwoba_model.predict([[exit_speed, launch_angle]])[0]

    k_or_bb = event.get('KorBB', None)
    if k_or_bb == "Strikeout":
        return 0
    if k_or_bb == "Walk":
        return 0.695

    # An in-play row that reaches this point is missing a measurement, so it
    # falls back to the weight of what actually happened.
    if in_play:
        return event['RunValue']
    return np.nan
# NOTE(review): these two lines look like leftover debugging -- confirm and remove.
# Writes the frame as it stands here (before the Stuff+ / xBA / xWOBA columns
# are added further down) to 'test.csv', then prints the model feature list.
df.to_csv('test.csv')
print(features)
def predict_stuff_plus(event, mean=-0.013336906, std=0.17713484357387707, scale_factor=100):
    """Score one pitch with the Stuff+ model and rescale it around 100.

    The raw prediction of the module-level ``stuff_plus_model`` is converted
    to a z-score with ``mean`` and ``std`` and mapped to
    ``100 + z * scale_factor``.

    Parameters
    ----------
    event : pandas.Series
        One pitch row; must contain every label in the module-level
        ``features`` list.
    mean, std : float, optional
        Centre and spread of the raw model output used for the z-score.
    scale_factor : float, optional
        Points added per standard deviation. With the default of 100, a raw
        prediction one ``std`` above ``mean`` scores 200.
        NOTE(review): an earlier comment on this constant said one standard
        deviation should move the score by 10 -- confirm which is intended;
        ``scale_factor=10`` gives that behaviour.

    Returns
    -------
    float
        The rescaled score, or NaN when ``event`` lacks any feature label.
    """
    if not all(name in event for name in features):
        return np.nan

    # The model is called with a single-row 2-D array: (1, n_features).
    model_input = event[features].values.reshape(1, -1)
    raw_prediction = stuff_plus_model.predict(model_input)[0]

    return 100 + ((raw_prediction - mean) / std * scale_factor)
# Apply Stuff+ prediction row-wise. One pass is enough: running the same
# apply a second time only recomputes identical values.
df['Stuff+'] = df.apply(predict_stuff_plus, axis=1)

# Apply predictions
df['xBA'] = df.apply(predict_xba, axis=1)
df['xWOBA'] = df.apply(event_predict, axis=1)

# Add Good Swing Decision: 1 when the batter swung at a pitch in the zone or
# took a pitch outside it, otherwise 0.
# The 0/1 flag columns are integers, and on integers `~` and `|` are bitwise
# (~0 == -1, ~1 == -2), which would yield -1 / -2 instead of 1 / 0. Convert to
# booleans before combining.
swung = df['Swing?'].astype(bool)
in_zone = df['In Strike Zone?'].astype(bool)
df['Good Swing Decision'] = ((swung & in_zone) | (~swung & ~in_zone)).astype(int)

# Save the modified dataframe
# NOTE(review): this overwrites the file the cell read its input from. Since
# PlateLocSide is negated earlier in the cell, re-running on the saved file
# flips that sign again -- consider writing to a separate output path.
df.to_csv(data_path, index=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


['RelSpeed', 'SpinRate', 'RelHeight', 'RelSide', 'Extension', 'InducedVertBreak', 'HorzBreak', 'VertApprAngle', 'ZoneSpeed', 'ff_diff', 'ivb_diff', 'hb_diff', 'PitchType_Changeup', 'PitchType_Curveball', 'PitchType_Cutter', 'PitchType_Fastball', 'PitchType_Knuckleball', 'PitchType_Sinker', 'PitchType_Slider', 'PitchType_Splitter']




In [46]:
# Sanity check: number of model input columns in `features`
# (relies on `features` having been defined by the processing cell).
print(len(features))

20


In [1]:
import glob
import pandas as pd
# Collect every CSV in the CornBelters data folder. glob returns matches in a
# filesystem-dependent order, so sort them to make the row order of the
# combined file reproducible.
kcl_files = sorted(glob.glob("../CornBeltersData/*.csv"))
all_files = kcl_files

# pd.concat fails with an unhelpful message on an empty list; fail early with
# the pattern that matched nothing instead.
if not all_files:
    raise FileNotFoundError("No CSV files found matching ../CornBeltersData/*.csv")

# Read and concatenate all CSVs into one DataFrame
xba_data_list = [pd.read_csv(f) for f in all_files]
df = pd.concat(xba_data_list, ignore_index=True)
df.to_csv('Data/2025.csv', index=False)