In [1]:
import glob
from pathlib import Path

import pandas as pd
# Collect the CSV exports for both data sources.
# NOTE(review): both patterns currently point at ../CornBeltersData, so the two
# lists are identical — confirm the intended KCL directory and update the first glob.
kcl_files = glob.glob("../CornBeltersData/*.csv")
cornbelters_files = glob.glob("../CornBeltersData/*.csv")

# De-duplicate the paths so a file matched by both patterns is only read once
# (otherwise every row ends up in df twice), and sort them so the row order of
# the combined frame does not depend on the OS directory listing order.
all_files = sorted(set(kcl_files + cornbelters_files))
if not all_files:
    # pd.concat would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in ../CornBeltersData")

# Read and concatenate all CSVs into one DataFrame
data_list = [pd.read_csv(f) for f in all_files]
df = pd.concat(data_list, ignore_index=True)


In [None]:
import pandas as pd
import numpy as np
import pickle


# Pitch types treated as "offspeed": only these receive the fastball-relative
# difference features (ff_diff, ivb_diff, hb_diff).
# NOTE(review): Changeup and Splitter are not listed although they are among the
# trained pitch-type columns — confirm this matches how the model was trained.
offspeed_pitches = [
    'Sinker',
    'Curveball',
    'Slider',
    'Cutter',
]

# Difference calculation functions
def calculate_ff_diff(event):
    """Velocity gap between the pitcher's average fastball and this pitch.

    Reads the module-level ``df`` and ``offspeed_pitches``.

    Args:
        event: one pitch row (indexable by 'RelSpeed', 'TaggedPitchType', 'Pitcher').

    Returns:
        The pitcher's mean Fastball RelSpeed minus this pitch's RelSpeed, or NaN
        when RelSpeed or the pitch type is missing, the pitch type is not in
        ``offspeed_pitches``, or no fastball average is available.
    """
    speed = event['RelSpeed']
    if pd.isna(speed):
        return np.nan

    pitch_type = event['TaggedPitchType']
    if pd.isna(pitch_type) or pitch_type not in offspeed_pitches:
        return np.nan

    # All fastballs thrown by the same pitcher anywhere in the dataset.
    same_pitcher = df['Pitcher'] == event['Pitcher']
    is_fastball = df['TaggedPitchType'] == 'Fastball'
    fastball_avg = df.loc[same_pitcher & is_fastball, 'RelSpeed'].mean()

    return np.nan if pd.isna(fastball_avg) else fastball_avg - speed

def calculate_ivb_diff(event):
    """Induced-vertical-break gap between the pitcher's average fastball and this pitch.

    Reads the module-level ``df`` and ``offspeed_pitches``.

    Args:
        event: one pitch row (indexable by 'InducedVertBreak', 'TaggedPitchType',
            'Pitcher').

    Returns:
        The pitcher's mean Fastball InducedVertBreak minus this pitch's
        InducedVertBreak, or NaN when the value or the pitch type is missing,
        the pitch type is not in ``offspeed_pitches``, or no fastball average
        is available.
    """
    vert_break = event['InducedVertBreak']
    if pd.isna(vert_break):
        return np.nan

    pitch_type = event['TaggedPitchType']
    if pd.isna(pitch_type) or pitch_type not in offspeed_pitches:
        return np.nan

    # All fastballs thrown by the same pitcher anywhere in the dataset.
    same_pitcher = df['Pitcher'] == event['Pitcher']
    is_fastball = df['TaggedPitchType'] == 'Fastball'
    fastball_avg = df.loc[same_pitcher & is_fastball, 'InducedVertBreak'].mean()

    return np.nan if pd.isna(fastball_avg) else fastball_avg - vert_break

def calculate_hb_diff(event):
    """Horizontal-break gap between the pitcher's average fastball and this pitch.

    Reads the module-level ``df`` and ``offspeed_pitches``.

    Args:
        event: one pitch row (indexable by 'HorzBreak', 'TaggedPitchType', 'Pitcher').

    Returns:
        The pitcher's mean Fastball HorzBreak minus this pitch's HorzBreak, or
        NaN when the value or the pitch type is missing, the pitch type is not
        in ``offspeed_pitches``, or no fastball average is available.
    """
    horz_break = event['HorzBreak']
    if pd.isna(horz_break):
        return np.nan

    pitch_type = event['TaggedPitchType']
    if pd.isna(pitch_type) or pitch_type not in offspeed_pitches:
        return np.nan

    # All fastballs thrown by the same pitcher anywhere in the dataset.
    same_pitcher = df['Pitcher'] == event['Pitcher']
    is_fastball = df['TaggedPitchType'] == 'Fastball'
    fastball_avg = df.loc[same_pitcher & is_fastball, 'HorzBreak'].mean()

    return np.nan if pd.isna(fastball_avg) else fastball_avg - horz_break

# Apply difference calculations
# Vectorised equivalent of applying calculate_ff_diff / calculate_ivb_diff /
# calculate_hb_diff to every row. The row-wise version re-filters the whole
# DataFrame for each pitch (quadratic in the number of rows); here each
# pitcher's fastball mean is computed once with a groupby and looked up per row.
_is_fastball = df['TaggedPitchType'] == 'Fastball'
_is_offspeed = df['TaggedPitchType'].isin(offspeed_pitches)  # NaN pitch types -> False

for _diff_col, _source_col in [('ff_diff', 'RelSpeed'),
                               ('ivb_diff', 'InducedVertBreak'),
                               ('hb_diff', 'HorzBreak')]:
    # Mean of the source column over each pitcher's fastballs (NaN values skipped).
    _fastball_means = df.loc[_is_fastball].groupby('Pitcher')[_source_col].mean()
    # Pitchers with no fastballs (or a missing name) map to NaN.
    _pitcher_avg = df['Pitcher'].map(_fastball_means).astype(float)
    # NaN propagates when either side is missing; non-offspeed rows are blanked out.
    df[_diff_col] = (_pitcher_avg - df[_source_col]).where(_is_offspeed)

# One-hot encode the tagged pitch type into float PitchType_<name> indicator columns
dummies = pd.get_dummies(df['TaggedPitchType'], prefix='PitchType', dtype=float)
df = pd.concat([df, dummies], axis=1)

# Indicator columns the model expects
trained_dummy_columns = [
    'PitchType_Changeup', 'PitchType_Curveball', 'PitchType_Cutter',
    'PitchType_Fastball', 'PitchType_Knuckleball', 'PitchType_Sinker',
    'PitchType_Slider', 'PitchType_Splitter',
]

# Pitch types that never occur in this data still need a column: fill it with zeros
for col in trained_dummy_columns:
    if col in df.columns:
        continue
    df[col] = 0.0

# Remove indicator columns for pitch types outside the trained set
untrained_dummy_columns = [
    name for name in df.columns
    if name.startswith('PitchType_') and name not in trained_dummy_columns
]
df = df.drop(columns=untrained_dummy_columns)

# Model inputs: pitch measurements, fastball-relative differences, then the pitch-type indicators
features = [
    'RelSpeed', 'SpinRate', 'RelHeight', 'RelSide', 'Extension',
    'InducedVertBreak', 'HorzBreak', 'VertApprAngle', 'ZoneSpeed',
    'ff_diff', 'ivb_diff', 'hb_diff',
    *trained_dummy_columns,
]

# Keep only the first occurrence of any repeated column name
duplicate_column_mask = df.columns.duplicated()
df = df.loc[:, ~duplicate_column_mask]


# Load models
# Deserialize the pickled Stuff+ model from disk.
# NOTE(review): pickle.load can execute arbitrary code while loading — only use
# model files that come from a trusted source.
with open("../Stuff+/stuff_plus_model.pkl", "rb") as model_file:
    stuff_plus_model = pickle.load(model_file)


def predict_stuff_plus(event, model=None, feature_names=None,
                       mean=-0.013336906, std=0.17713484357387707,
                       scale_factor=100):
    """Predict a scaled Stuff+ score for a single pitch.

    Args:
        event: one pitch row (a pandas Series indexed by column name).
        model: object exposing ``predict(2-D array)``; defaults to the
            module-level ``stuff_plus_model``.
        feature_names: ordered list of labels to read from ``event``; defaults
            to the module-level ``features``.
        mean: value subtracted from the raw prediction before scaling.
        std: value the centred prediction is divided by.
            (mean/std are presumably statistics of the model's raw output —
            TODO confirm where they were measured.)
        scale_factor: points added to the score per ``std`` of raw prediction.
            NOTE(review): the previous inline comment said one std moves the
            score by 10, but the value has always been 100 — confirm which
            scale is intended. The default keeps the existing behaviour.

    Returns:
        ``100 + (raw_prediction - mean) / std * scale_factor`` as a scalar, or
        NaN when any requested feature label is absent from ``event``.
    """
    # Fall back to the notebook-level model / feature list so that
    # df.apply(predict_stuff_plus, axis=1) keeps working unchanged.
    if model is None:
        model = stuff_plus_model
    if feature_names is None:
        feature_names = features
    feature_names = list(feature_names)

    # `name in event` tests the Series index, i.e. whether the column exists.
    if not all(name in event for name in feature_names):
        return np.nan

    # The model expects a 2-D array: one row, one column per feature.
    input_data = event[feature_names].values.reshape(1, -1)
    stuff_plus = model.predict(input_data)[0]

    # Centre on 100 and rescale.
    return 100 + ((stuff_plus - mean) / std * scale_factor)
# Score every pitch: call predict_stuff_plus once per row and store the results as the Stuff+ column
stuff_plus_scores = df.apply(predict_stuff_plus, axis=1)
df['Stuff+'] = stuff_plus_scores

# TODO: add Good Swing Decision (no code for this step exists in this cell yet)
# The modified DataFrame is saved to CSV in the next cell




OSError: Cannot save file into a non-existent directory: 'paper'

In [3]:
# Write the scored pitch data to CSV (without the index column).
data_path = "stuffdata.csv"
# DataFrame.to_csv raises OSError when the target directory does not exist, so
# create it first; for a bare file name this is a no-op on the current directory.
Path(data_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(data_path, index=False)