In [4]:
import pandas as pd
import numpy as np
import pickle

# Read the Yakkertech CSV file
data_path = 'Data/2025.csv'
df = pd.read_csv(data_path)

# The processed frame is written back to data_path at the end of this cell, so
# a second run would read already-processed rows and flip PlateLocSide again.
# Refuse to run on a file that already carries a column this cell creates.
if 'Vert Strike' in df.columns:
    raise RuntimeError(
        f"{data_path} already contains derived columns; regenerate it from the "
        "raw CSVs before running this cell again."
    )

# Define RunValue weights
Weights = {
    "Walk": 0.695, "HitByPitch": 0.727,
    "Single": 0.891, "Double": 1.269,
    "Triple": 1.609, "HomeRun": 2.078
}

# Map RunValue; any PlayResult without a weight (including NaN) scores 0
df['RunValue'] = df['PlayResult'].map(Weights).fillna(0)

# Load models
# SECURITY: pickle.load executes arbitrary code stored in the file. Only load
# model files that were produced by a trusted source.
# NOTE(review): the saved cell output links to scikit-learn's model-persistence
# notes, presumably a version-mismatch warning on unpickling -- confirm that
# the models were trained with the installed scikit-learn version.
with open("../Stuff+/stuff_plus_model.pkl", "rb") as f:
    stuff_plus_model = pickle.load(f)

with open("../xBA_CornBelters/xba_model.pkl", "rb") as f:
    xba_model = pickle.load(f)

with open("../CornBelters/xwoba.pkl", "rb") as f:
    xwoba_model = pickle.load(f)

# Rename HitType to TaggedHitType
df = df.rename(columns={'HitType': 'TaggedHitType'})

# Define standard strike zone boundaries
vert_strike_min, vert_strike_max = 1.5, 3.5  # Standard vertical strike zone
horz_strike_min, horz_strike_max = -1, 1  # Standard horizontal strike zone

# Mirror the horizontal plate location.
# NOTE(review): presumably converts between pitcher and catcher view -- confirm.
df['PlateLocSide'] = df['PlateLocSide'] * -1

# Add strike zone columns (bounds are inclusive; NaN locations count as outside)
df['Vert Strike'] = df['PlateLocHeight'].between(vert_strike_min, vert_strike_max).astype(int)
df['Horz Strike'] = df['PlateLocSide'].between(horz_strike_min, horz_strike_max).astype(int)

# Add pitch outcome columns
df['Strike?'] = (df['PitchCall'] == 'StrikeCalled').astype(int)
df['Foul?'] = (df['PitchCall'] == 'FoulBall').astype(int)
df['In Play?'] = (df['PitchCall'] == 'InPlay').astype(int)
df['Swing Strike?'] = (df['PitchCall'] == 'StrikeSwinging').astype(int)
df['Swing?'] = df['PitchCall'].isin(['StrikeSwinging', 'FoulBall', 'InPlay']).astype(int)
df['Ball Called?'] = (df['PitchCall'] == 'BallCalled').astype(int)
df['First Pitch'] = (df['PitchofPA'] == 1).astype(int)

# Add strike zone logic columns
df['In Strike Zone?'] = (df['Vert Strike'] & df['Horz Strike']).astype(int)

# Work on boolean masks: `~` on a 0/1 integer column is a bitwise NOT
# (-1 / -2), not a logical negation, so negate booleans instead.
in_zone = df['In Strike Zone?'].astype(bool)
swung = df['Swing?'].astype(bool)
df['Chase?'] = (swung & ~in_zone).astype(int)
df['In-zone take'] = (df['PitchCall'].isin(['StrikeCalled', 'BallCalled']) & in_zone).astype(int)
df['In-zone whiff'] = ((df['PitchCall'] == 'StrikeSwinging') & in_zone).astype(int)
df['In-zone/swing'] = (swung & in_zone).astype(int)

# Add batted ball columns (comparisons against NaN are False, so balls in play
# without tracking data only count through their tagged hit type)
in_play = df['PitchCall'] == 'InPlay'
df['LA<10'] = ((df['Angle'] < 10) & in_play).astype(int)
df['Ground Ball?'] = ((df['TaggedHitType'] == 'GroundBall') | ((df['Angle'] < 10) & in_play)).astype(int)
df['Fly Ball?'] = ((df['TaggedHitType'] == 'FlyBall') | ((df['Angle'] > 25) & in_play)).astype(int)
df['EV>90'] = ((df['ExitSpeed'] > 90) & in_play).astype(int)
df['EV>100'] = ((df['ExitSpeed'] > 100) & in_play).astype(int)
df['EV>105'] = ((df['ExitSpeed'] > 105) & in_play).astype(int)

# Compute diff features for off-speed pitches: distance from the average
# fastball over the whole file; fastballs themselves get 0.
not_fastball = df['TaggedPitchType'] != 'Fastball'
fastball_avg = df.loc[~not_fastball, ['RelSpeed', 'InducedVertBreak', 'HorzBreak']].mean()
df['ff_diff'] = np.where(not_fastball, df['RelSpeed'] - fastball_avg['RelSpeed'], 0)
df['ivb_diff'] = np.where(not_fastball, df['InducedVertBreak'] - fastball_avg['InducedVertBreak'], 0)
df['hb_diff'] = np.where(not_fastball, df['HorzBreak'] - fastball_avg['HorzBreak'], 0)

# Define the new predict_stuff_plus function
def predict_stuff_plus(event, model, y):
    """Score a single pitch on a 50-150 Stuff+ scale.

    Parameters
    ----------
    event : mapping
        One pitch (e.g. ``row.to_dict()``) holding the twelve model features.
    model : object
        Anything exposing ``predict(DataFrame)``; it is called with a one-row
        frame and the first element of the result is used.
    y : object
        If it has a ``mean`` attribute, ``y.mean()`` centres the logistic
        curve; otherwise a fixed fallback mean is used.

    Returns
    -------
    float
        ``np.nan`` when any non-diff feature is missing or NaN; otherwise
        ``50 + 100 * s`` where ``s`` is a decreasing logistic of the model
        prediction, so a lower prediction gives a higher Stuff+.
    """
    # Define the full set of features used in training (12 features, based on model expecting 12)
    core_features = ['RelSpeed', 'SpinRate', 'RelHeight', 'RelSide', 'Extension',
                     'InducedVertBreak', 'HorzBreak', 'VertApprAngle', 'ZoneSpeed']
    diff_features = ['ff_diff', 'ivb_diff', 'hb_diff']
    required_features = core_features + diff_features

    input_data = {}
    for f in required_features:
        if f in event and not pd.isna(event[f]):
            input_data[f] = event[f]
        elif f in diff_features:
            # A missing diff feature means "no offset from the fastball".
            # Fill 0 explicitly; leaving the key out would hand the model NaN.
            input_data[f] = 0.0
        else:
            return np.nan  # Missing critical feature

    # Convert to DataFrame for pipeline compatibility
    input_df = pd.DataFrame([input_data], columns=required_features)

    # Predict using the trained model (pipeline handles scaling)
    pred = model.predict(input_df)[0]

    # Decreasing logistic centred on the mean run value: compresses the
    # prediction into (0, 1), where higher pred -> lower Stuff+
    k = 2.0  # Steepness parameter (adjusted for xRV range)
    xrv_mean = y.mean() if hasattr(y, 'mean') else 0.5297397769516728  # Fallback to provided mean
    sigmoid = 1 / (1 + np.exp(k * (pred - xrv_mean)))

    # Scale to 50-150 (lower pred -> higher Stuff+, higher pred -> lower Stuff+)
    scaled_pred = 50 + (150 - 50) * sigmoid

    return scaled_pred

# Define predict_xba (unchanged)
def predict_xba(event):
    """Return the expected batting average for one pitch event.

    A ball in play with both exit speed and launch angle is scored by the
    module-level ``xba_model``. A walk yields ``np.nan``. Every other event
    (strikeout, untracked ball in play, anything else) yields 0.
    """
    has_contact_data = not (pd.isna(event['ExitSpeed']) or pd.isna(event['Angle']))
    if has_contact_data and event['PitchCall'] == 'InPlay':
        return xba_model.predict([[event['ExitSpeed'], event['Angle']]])[0]

    plate_outcome = event.get('KorBB', None)
    if plate_outcome == "Strikeout":
        return 0
    if plate_outcome == "Walk":
        return np.nan  # Leave Walk as NA

    if not has_contact_data and event['PitchCall'] == "InPlay":
        return 0  # If no batted ball data, treat as 0 for xBA
    return 0  # Default to 0 for all other cases

# Define event_predict (unchanged)
def event_predict(event):
    """Return the expected wOBA value for one pitch event.

    A ball in play with both exit speed and launch angle is scored by the
    module-level ``xwoba_model``. A strikeout yields 0 and a walk 0.695. A ball
    in play without tracking data falls back to the event's ``RunValue``.
    Anything else yields ``np.nan``.
    """
    batted_ball_tracked = not (pd.isna(event['ExitSpeed']) or pd.isna(event['Angle']))
    if batted_ball_tracked and event['PitchCall'] == "InPlay":
        return xwoba_model.predict([[event['ExitSpeed'], event['Angle']]])[0]

    plate_outcome = event.get('KorBB', None)
    if plate_outcome == "Strikeout":
        return 0
    if plate_outcome == "Walk":
        return 0.695

    if not batted_ball_tracked and event['PitchCall'] == "InPlay":
        return event['RunValue']
    return np.nan  # Default for other cases

# Apply predictions (row-wise: each helper scores one pitch event)
df['Stuff+'] = df.apply(lambda row: predict_stuff_plus(row.to_dict(), stuff_plus_model, df['RunValue']), axis=1)
df['xBA'] = df.apply(predict_xba, axis=1)
df['xWOBA'] = df.apply(event_predict, axis=1)

# Add Good Swing Decision: swing at a pitch in the zone, or take a pitch outside it.
# 'Swing?' and 'In Strike Zone?' are 0/1 integer columns, and `~` on integers is
# a bitwise NOT (~0 == -1, ~1 == -2), which made this column -1/-2 instead of
# 0/1. Convert to booleans before negating.
swing_mask = df['Swing?'].astype(bool)
zone_mask = df['In Strike Zone?'].astype(bool)
df['Good Swing Decision'] = ((swing_mask & zone_mask) | (~swing_mask & ~zone_mask)).astype(int)

# Save the modified dataframe to a new CSV
# NOTE: data_path is the same file that was read, so the input is overwritten.
df.to_csv(data_path, index=False)

print(f'File processed and saved as {data_path} with new columns.')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


File processed and saved as Data/2025.csv with new columns.


In [1]:
import glob
import pandas as pd
# Collect the raw CSV exports. glob returns matches in arbitrary order, so sort
# them to make the row order of the combined file reproducible.
kcl_files = sorted(glob.glob("../CornBeltersData/*.csv"))
all_files = kcl_files

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not all_files:
    raise FileNotFoundError("No CSV files found matching ../CornBeltersData/*.csv")

# Read and concatenate all CSVs into one DataFrame
xba_data_list = [pd.read_csv(f) for f in all_files]
df = pd.concat(xba_data_list, ignore_index=True)

# This is the file the processing cell reads (and overwrites), so run this
# cell before it.
df.to_csv('Data/2025.csv', index=False)