In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
import glob
import numpy as np
import xgboost as xgb


In [2]:
cornbelters_files = glob.glob("../KCLData/*.csv")
other_files = glob.glob("../CornBeltersData/*.csv")

all_files = cornbelters_files + other_files

# Read and concatenate all CSVs into one DataFrame
stuff_plus = [pd.read_csv(f) for f in all_files]
df = pd.concat(stuff_plus, ignore_index=True)


In [29]:
dft = df[['TaggedPitchType','Pitcher','PlayResult','RelSpeed','SpinRate','RelHeight','RelSide','Extension','InducedVertBreak', 'HorzBreak','VertApprAngle','ZoneSpeed']]

# Create dummy variables for TaggedPitchType
dummies = pd.get_dummies(dft['TaggedPitchType'], prefix='PitchType',dtype=float)
dft = pd.concat([dft, dummies], axis=1)

In [30]:
dft['PlayResult'].unique()


array([nan, 'StrikeoutLooking', 'StrikeoutSwinging', 'Out', 'Single',
       'Walk', 'Error', 'Double', 'Triple', 'HomeRun', 'FieldersChoice',
       'Sacrifice', 'HItByPitch'], dtype=object)

In [31]:
dft = dft.dropna()

In [32]:
map_data = {
    'Out': -.33,
    'Single': .46,
    'Double': .79,
    'StrikeoutLooking': -.33,
    'StrikeoutSwinging': -.33,
    'Error': 0,
    'Walk': .31,
    'Triple': 1.07,
    'HomeRun': 1.41,
    'FieldersChoice': 0,
    'Sacrifice': 0,
    'HItByPitch': .33  # Fixed typo: was 'HItByPitch': .330
}

dft['PlayResult'] = dft['PlayResult'].map(map_data)

In [33]:
offspeed_pitches = ['Sinker','Curveball','Slider','Cutter','Cutter']

def calculate_ff_diff(event):
    """Calculate fastball speed difference with error handling"""
    if pd.notna(event['RelSpeed']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df[(df['Pitcher'] == event['Pitcher']) & 
                         (df['TaggedPitchType'] == 'Fastball')]['RelSpeed'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['RelSpeed']
    return np.nan

def calculate_ivb_diff(event):
    """Calculate induced vertical break difference with error handling"""
    if pd.notna(event['InducedVertBreak']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df[(df['Pitcher'] == event['Pitcher']) & 
                         (df['TaggedPitchType'] == 'Fastball')]['InducedVertBreak'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['InducedVertBreak']
    return np.nan

def calculate_hb_diff(event):
    """Calculate horizontal break difference with error handling"""
    if pd.notna(event['HorzBreak']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df[(df['Pitcher'] == event['Pitcher']) & 
                         (df['TaggedPitchType'] == 'Fastball')]['HorzBreak'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['HorzBreak']
    return np.nan

In [34]:
dft['ff_diff'] = dft.apply(calculate_ff_diff, axis=1)
dft['ivb_diff'] = dft.apply(calculate_ivb_diff, axis=1)
dft['hb_diff'] = dft.apply(calculate_hb_diff, axis=1)
    

In [35]:
dft.head()

Unnamed: 0,TaggedPitchType,Pitcher,PlayResult,RelSpeed,SpinRate,RelHeight,RelSide,Extension,InducedVertBreak,HorzBreak,...,PitchType_Curveball,PitchType_Cutter,PitchType_Fastball,PitchType_Knuckleball,PitchType_Sinker,PitchType_Slider,PitchType_Splitter,ff_diff,ivb_diff,hb_diff
6,Slider,Roy Rolston,-0.33,78.997127,2234.3547,5.643353,1.872089,6.0,6.484273,-2.228677,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.567863,11.272762,14.542166
12,Fastball,Roy Rolston,-0.33,86.577867,2090.806727,5.238482,2.212006,6.0,17.096434,15.031649,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,,
16,Fastball,Roy Rolston,-0.33,87.530392,1987.298234,5.566592,2.257963,6.0,18.543534,12.824408,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,,
20,Slider,Devan Tupper,-0.33,80.916924,2165.574886,5.271006,2.011555,6.0,-1.006549,4.595121,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.389381,15.222346,8.92275
28,Fastball,Devan Tupper,-0.33,85.023046,1722.172115,5.837093,2.935782,6.0,14.298228,13.349795,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,,


In [36]:
features = ['RelSpeed','SpinRate','RelHeight','RelSide','Extension','InducedVertBreak', 'HorzBreak','VertApprAngle','ZoneSpeed','ff_diff','ivb_diff','hb_diff']

In [39]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline


In [54]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import RandomizedSearchCV, KFold
from scipy.stats import uniform, randint
from xgboost import XGBRegressor
from sklearn.datasets import make_regression
import numpy as np
from math import sqrt

X = dft[features]
y = dft['PlayResult']

# Define the pipeline with RobustScaler and XGBRegressor
pipeline = make_pipeline(
    RobustScaler(),
    XGBRegressor(objective='reg:squarederror', eval_metric='rmse', random_state=42)
)

# Define the parameter distributions for RandomizedSearchCV
# Note: Prefix parameters with 'xgbregressor__' to reference the XGBRegressor step in the pipeline
param_distributions = {
    'xgbregressor__n_estimators': randint(100, 1000),
    'xgbregressor__learning_rate': uniform(0.01, 0.29),
    'xgbregressor__max_depth': randint(3, 12),
    'xgbregressor__min_child_weight': randint(1, 10),
    'xgbregressor__subsample': uniform(0.5, 0.5),
    'xgbregressor__colsample_bytree': uniform(0.5, 0.5),
    'xgbregressor__gamma': uniform(0, 1),
    'xgbregressor__reg_alpha': uniform(0, 1),
    'xgbregressor__reg_lambda': uniform(0, 2),
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings to sample
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_root_mean_squared_error',
    verbose=2,
    random_state=42,
    n_jobs=-1,
    return_train_score=True
)

# Fit the RandomizedSearchCV object to the data
print("\nStarting RandomizedSearchCV...")
random_search.fit(X, y)

# Get the best parameters and best score
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation RMSE: {sqrt(-random_search.best_score_)}")

# Optional: Access the best model and make predictions
best_model = random_search.best_estimator_
# Example prediction
y_pred = best_model.predict(X[:5])
print("Predictions for first 5 samples:", y_pred)


Starting RandomizedSearchCV...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found: {'xgbregressor__colsample_bytree': np.float64(0.9593752611303732), 'xgbregressor__gamma': np.float64(0.7900846371240867), 'xgbregressor__learning_rate': np.float64(0.01667655538841488), 'xgbregressor__max_depth': 3, 'xgbregressor__min_child_weight': 9, 'xgbregressor__n_estimators': 150, 'xgbregressor__reg_alpha': np.float64(0.7364442356247228), 'xgbregressor__reg_lambda': np.float64(1.8687340295380297), 'xgbregressor__subsample': np.float64(0.9627842564533882)}
Best cross-validation RMSE: 0.6424839796018763
Predictions for first 5 samples: [-0.07446239 -0.02097097 -0.00362801 -0.01083122  0.00539356]


In [55]:
xgb_model = xgb.XGBRegressor(**random_search.best_params_,)
xgb_model.fit(X, y)


Parameters: { "xgbregressor__colsample_bytree", "xgbregressor__gamma", "xgbregressor__learning_rate", "xgbregressor__max_depth", "xgbregressor__min_child_weight", "xgbregressor__n_estimators", "xgbregressor__reg_alpha", "xgbregressor__reg_lambda", "xgbregressor__subsample" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
import numpy as np
import pandas as pd

def predict_stuff_plus(event):
    # Define the full set of features used in training (12 features, based on error expecting 12)
    required_features = ['RelSpeed', 'SpinRate', 'RelHeight', 'RelSide', 'Extension', 
                        'InducedVertBreak', 'HorzBreak', 'VertApprAngle', 'ZoneSpeed', 
                        'ff_diff', 'ivb_diff', 'hb_diff']
    
    # Check if all required features are present; fill missing with 0 for fastballs
    input_data = {}
    for f in required_features:
        if f in event and not pd.isna(event[f]):
            input_data[f] = event[f]
        else:
            # For fastballs, set diff features to 0 if missing
            if event['TaggedPitchType'] == 'Fastball' and f in ['ff_diff', 'ivb_diff', 'hb_diff']:
                input_data[f] = 0
            else:
                return np.nan  # Missing critical feature
    
    # Convert to DataFrame for pipeline compatibility
    input_df = pd.DataFrame([input_data], columns=required_features)
    
    # Predict using the trained model (pipeline handles scaling)
    pred = xgb_model.predict(input_df)[0]
    
    # Apply inverse sigmoid transformation to compress to (0, 1), where higher pred -> lower Stuff+
    k = 2.0  # Steepness parameter (adjusted for xRV range)
    xrv_mean = y.mean() if hasattr(y, 'mean') else 0.28  # Fallback to approximate xRV mean
    sigmoid = 1 / (1 + np.exp(k * (pred - xrv_mean)))  # Note: Removed negative sign to invert
    
    # Scale to 50-150 (lower pred -> higher Stuff+, higher pred -> lower Stuff+)
    scaled_pred = 50 + (150 - 50) * sigmoid
    
    return scaled_pred

In [47]:

cornbelters_files = glob.glob("../CornBeltersData/*.csv")
kcl_files = glob.glob("../kclData/*.csv")


# Read and concatenate all CSVs into one DataFrame
cornbelters  = [pd.read_csv(f) for f in cornbelters_files]
kcl = [pd.read_csv(f) for f in kcl_files]
kcl_df = pd.concat(kcl, ignore_index=True)
cornbelters_df = pd.concat(cornbelters, ignore_index=True)

In [57]:
offspeed_pitches = ['Sinker','Curveball','Slider','Cutter','Cutter']

def calculate_ff_diff(event):
    """Calculate fastball speed difference with error handling"""
    if pd.notna(event['RelSpeed']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = kcl_df[(kcl_df['Pitcher'] == event['Pitcher']) & 
                         (kcl_df['TaggedPitchType'] == 'Fastball')]['RelSpeed'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['RelSpeed']
    return np.nan

def calculate_ivb_diff(event):
    """Calculate induced vertical break difference with error handling"""
    if pd.notna(event['InducedVertBreak']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = kcl_df[(kcl_df['Pitcher'] == event['Pitcher']) & 
                         (kcl_df['TaggedPitchType'] == 'Fastball')]['InducedVertBreak'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['InducedVertBreak']
    return np.nan

def calculate_hb_diff(event):
    """Calculate horizontal break difference with error handling"""
    if pd.notna(event['HorzBreak']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = kcl_df[(kcl_df['Pitcher'] == event['Pitcher']) & 
                         (kcl_df['TaggedPitchType'] == 'Fastball')]['HorzBreak'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['HorzBreak']
    return np.nan

In [58]:
kcl_df['ff_diff'] = kcl_df.apply(calculate_ff_diff, axis=1)
kcl_df['ivb_diff'] = kcl_df.apply(calculate_ivb_diff, axis=1)
kcl_df['hb_diff'] = kcl_df.apply(calculate_hb_diff, axis=1)

In [70]:
kcl_df['Stuff+'] = kcl_df.apply(predict_stuff_plus,axis=1)

In [71]:
pitcher_stuffplus = kcl_df.groupby('Pitcher')['Stuff+'].mean().reset_index()
pitcher_stuffplus.to_csv('stuff+.csv')

In [15]:
import pickle

In [17]:
with open("stuff_plus_model.pkl", "wb") as f:
    pickle.dump(xgb_model, f)