In [21]:
import pandas as pd
import numpy as np
import pickle
import glob
import seaborn as sns
import matplotlib.pyplot as plt

# Set Seaborn style for publication-quality plots
sns.set_theme(style="whitegrid", font_scale=1.2)

# Load and split datasets
cornbelters_files = glob.glob("../../CornBeltersData/*.csv")
kcl_files = glob.glob("../../KCLData/*.csv")

# Read and concatenate KCL files
kcl_dfs = [pd.read_csv(f) for f in kcl_files]
kcl_df = pd.concat(kcl_dfs, ignore_index=True) if kcl_dfs else pd.DataFrame()

# Read and concatenate CornBelters files
cornbelters_dfs = [pd.read_csv(f) for f in cornbelters_files]
cornbelters_df = pd.concat(cornbelters_dfs, ignore_index=True) if cornbelters_dfs else pd.DataFrame()
cornbelters_df = cornbelters_df[cornbelters_df['PitcherTeam'] == 'Normal cornbelters']
# Define offspeed pitches
offspeed_pitches = ['Sinker', 'Curveball', 'Slider', 'Cutter']

# Difference calculation functions
def calculate_ff_diff(event, df_ref):
    if pd.notna(event['RelSpeed']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df_ref[(df_ref['Pitcher'] == event['Pitcher']) & 
                              (df_ref['TaggedPitchType'] == 'Fastball')]['RelSpeed'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['RelSpeed']
    return np.nan

def calculate_ivb_diff(event, df_ref):
    if pd.notna(event['InducedVertBreak']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df_ref[(df_ref['Pitcher'] == event['Pitcher']) & 
                              (df_ref['TaggedPitchType'] == 'Fastball')]['InducedVertBreak'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['InducedVertBreak']
    return np.nan

def calculate_hb_diff(event, df_ref):
    if pd.notna(event['HorzBreak']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df_ref[(df_ref['Pitcher'] == event['Pitcher']) & 
                              (df_ref['TaggedPitchType'] == 'Fastball')]['HorzBreak'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['HorzBreak']
    return np.nan

def calculate_angle_diff(event, df_ref):
    if pd.notna(event['VertRelAngle']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df_ref[(df_ref['Pitcher'] == event['Pitcher']) & 
                              (df_ref['TaggedPitchType'] == 'Fastball')]['VertRelAngle'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['VertRelAngle']
    return np.nan

# Define trained dummy columns
trained_dummy_columns = [
    'PitchType_Changeup', 'PitchType_Curveball', 'PitchType_Cutter',
    'PitchType_Fastball', 'PitchType_Knuckleball', 'PitchType_Sinker',
    'PitchType_Slider', 'PitchType_Splitter'
]

# Define model features
features = ['RelSpeed', 'SpinRate', 'RelHeight', 'RelSide', 'Extension',
            'InducedVertBreak', 'VertRelAngle', 'HorzBreak', 'VertApprAngle',
            'ZoneSpeed', 'ff_diff', 'ivb_diff', 'hb_diff', 'ang_diff'] + trained_dummy_columns

# Load Stuff+ model
with open("../stuff_plus_model.pkl", "rb") as f:
    stuff_plus_model = pickle.load(f)

def predict_stuff_plus(event):
    if all(item in event for item in features):
        mean = -0.021042198
        std = 0.1688992037777563
        scale_factor = 10
        input_data = event[features].values.reshape(1, -1)
        stuff_plus = stuff_plus_model.predict(input_data)[0]
        stuff_plus_scaled = 100 + ((stuff_plus - mean) / std * scale_factor)
        return stuff_plus_scaled
    return np.nan

def process_dataset(df, dataset_name):
    if df.empty:
        print(f"No data in {dataset_name} dataset.")
        return None, None, None, None

    # Apply difference calculations
    df['ff_diff'] = df.apply(lambda x: calculate_ff_diff(x, df), axis=1)
    df['ivb_diff'] = df.apply(lambda x: calculate_ivb_diff(x, df), axis=1)
    df['hb_diff'] = df.apply(lambda x: calculate_hb_diff(x, df), axis=1)
    df['ang_diff'] = df.apply(lambda x: calculate_angle_diff(x, df), axis=1)

    # One-hot encode pitch types
    dummies = pd.get_dummies(df['TaggedPitchType'], prefix='PitchType', dtype=float)
    df = pd.concat([df, dummies], axis=1)

    # Add missing dummy columns with 0s
    for col in trained_dummy_columns:
        if col not in df.columns:
            df[col] = 0.0

    # Drop extra dummy columns
    df = df.drop(columns=[col for col in df.columns if col.startswith('PitchType_') and col not in trained_dummy_columns])

    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Apply Stuff+ prediction
    df['Stuff+'] = df.apply(predict_stuff_plus, axis=1)

    # Debug: Check unique pitch types per pitcher
    pitch_counts = df.groupby('Pitcher')['TaggedPitchType'].nunique().reset_index()
    print(f"\nPitchers with multiple pitch types ({dataset_name}):")
    print(pitch_counts[pitch_counts['TaggedPitchType'] > 1].to_string(index=False))

    # --- Table and Plot 1: Top 15 Pitchers by Average Stuff+ ---
    avg_stuff_plus = df.groupby('Pitcher')['Stuff+'].mean().reset_index()
    top_15_pitchers = avg_stuff_plus.sort_values('Stuff+', ascending=False).head(15)
    top_15_pitchers['Stuff+'] = top_15_pitchers['Stuff+'].round(2)
    top_15_pitchers.columns = ['Pitcher', 'Average Stuff+']

    print(f"\nTop 15 Pitchers by Average Stuff+ ({dataset_name}):")
    print(top_15_pitchers.to_string(index=False))
    top_15_pitchers.to_csv(f'top_15_pitchers_stuff_plus_{dataset_name}.csv', index=False)

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(data=top_15_pitchers, x='Average Stuff+', y='Pitcher', palette='coolwarm')
    plt.title(f'Top 15 Pitchers by Average Stuff+ ({dataset_name})', fontsize=14, pad=15)
    plt.xlabel('Average Stuff+', fontsize=12)
    plt.ylabel('Pitcher', fontsize=12)
    max_value = top_15_pitchers['Average Stuff+'].max()
    plt.xlim(0, max_value + 10)  # Extend x-axis to accommodate text
    for i, v in enumerate(top_15_pitchers['Average Stuff+']):
        ax.text(v + 1, i, f'{v:.2f}', va='center', fontsize=10)  # Adjust offset
    plt.tight_layout()
    plt.savefig(f'top_15_pitchers_stuff_plus_{dataset_name}.png', dpi=300, bbox_inches='tight')
    plt.close()

    # --- Table and Plot 2: Top 15 Pitchers Across All Pitch Types by Stuff+ ---
    avg_stuff_by_pitch = df.groupby(['Pitcher', 'TaggedPitchType'])['Stuff+'].mean().reset_index()
    top_15_all_pitches = avg_stuff_by_pitch.sort_values('Stuff+', ascending=False).head(15)
    top_15_all_pitches['Stuff+'] = top_15_all_pitches['Stuff+'].round(2)
    top_15_all_pitches.columns = ['Pitcher', 'Pitch Type', 'Average Stuff+']

    # Debug: Print all pitch type-pitcher combinations before taking top 15
    print(f"\nAll Pitcher-Pitch Type combinations before top 15 filter ({dataset_name}):")
    print(avg_stuff_by_pitch.sort_values('Stuff+', ascending=False).head(20).to_string(index=False))

    print(f"\nTop 15 Pitchers Across All Pitch Types by Stuff+ ({dataset_name}):")
    print(top_15_all_pitches.to_string(index=False))
    top_15_all_pitches.to_csv(f'top_15_all_pitches_stuff_plus_{dataset_name}.csv', index=False)

    top_15_all_pitches['Label'] = top_15_all_pitches['Pitcher'] + ' (' + top_15_all_pitches['Pitch Type'] + ')'

    plt.figure(figsize=(15, 10))
    ax = sns.barplot(data=top_15_all_pitches, x='Average Stuff+', y='Label', palette='coolwarm')
    plt.title(f'Top 15 Pitchers Across All Pitch Types by Stuff+ ({dataset_name})', fontsize=14, pad=15)
    plt.xlabel('Average Stuff+', fontsize=12)
    plt.ylabel('Pitcher (Pitch Type)', fontsize=12)
    max_value = top_15_all_pitches['Average Stuff+'].max()
    plt.xlim(0, max_value + 10)  # Extend x-axis to accommodate text
    for i, v in enumerate(top_15_all_pitches['Average Stuff+']):
        ax.text(v + 1, i, f'{v:.2f}', va='center', fontsize=10)  # Adjust offset
    plt.tight_layout()
    plt.savefig(f'top_15_all_pitches_stuff_plus_{dataset_name}.png', dpi=300, bbox_inches='tight')
    plt.close()

    return top_15_pitchers, top_15_all_pitches
# Process both datasets
kcl_top_15_pitchers, kcl_top_15_all_pitches = process_dataset(kcl_df, "KCL")
cornbelters_top_15_pitchers, cornbelters_top_15_all_pitches = process_dataset(cornbelters_df, "CornBelters")




Pitchers with multiple pitch types (KCL):
             Pitcher  TaggedPitchType
         Adan Nieves                5
          Andrew Tay                5
Benedict Hendrickson                2
         Bob Tomhave                2
    Braden Alexander                4
     Braden Deverman                3
         Braden Mehn                6
        Brandon Ward                3
     Brayden Elliott                5
       Brayden Zilis                2
     Brennan Tomhave                3
       Brett Granger                5
   Brody Stonecipher                3
       Cade Starrick                3
         Caden Addis                4
         Cal Darling                2
          Caleb Ochs                4
     Cameron Clifton                5
      Canden Hardman                5
      Carter Sellers                3
         Cayden Rose                6
       Cole Mcclusky                4
         Connor Hale                3
       Cooper Stolfa                7
       


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=top_15_pitchers, x='Average Stuff+', y='Pitcher', palette='coolwarm')



All Pitcher-Pitch Type combinations before top 15 filter (KCL):
         Pitcher TaggedPitchType     Stuff+
   Brett Granger        Splitter 129.685776
   Cooper Stolfa        Splitter 124.771263
     Cayden Rose        Changeup 123.179855
   Brett Granger        Changeup 122.943481
    Joey Kahwaji     Knuckleball 121.655556
   Cooper Stolfa        Changeup 120.067673
  Canden Hardman        Changeup 119.983154
 Brennan Tomhave        Changeup 118.653900
    Nolan Bowles        Changeup 118.567390
Teagan Disharoom        Splitter 118.420212
     Adan Nieves        Changeup 118.015388
     Jack Wisdom        Changeup 117.954971
   Cole Mcclusky        Changeup 117.832405
    Devan Tupper        Changeup 117.801041
  Nolan Vanduzer        Changeup 117.678772
      Jack Piper        Changeup 117.665245
 Patrick Kennedy        Changeup 117.378815
     Lucas Frank        Changeup 117.003937
    Noah Suttles        Changeup 117.000175
    Payton Knoll        Changeup 116.536430

Top 15 Pit


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=top_15_all_pitches, x='Average Stuff+', y='Label', palette='coolwarm')



Pitchers with multiple pitch types (CornBelters):
        Pitcher  TaggedPitchType
   Alec Bergman                2
Braden Vanderhe                4
   Brice Deaton                3
   Cole Noreuil                4
    Dom Panozzo                2
Dominic Panozzo                2
  Eli Tritinger                3
    Eli Woodall                2
   Graham Kasey                7
     Isaac Graf                2
Jackson Mcdonal                4
Jackson Stewart                2
    Jake Fenton                3
 Jimmy Amptmann                3
  Logan Barnett                4
   Luka Zachman                4
  Mason Orton\t                2
Nicholas Currie                3
   Nick Krueger                4
   Owen Corbett                3
     Rj Bergren                3
 Toby Schriefer                4
    Trey Bryant                3
  Will O'gorman                4
   Wyatt Mammen                4
   Zach Courson                3
 Zach O'Donnell                3
 Zach O'donnell          


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=top_15_pitchers, x='Average Stuff+', y='Pitcher', palette='coolwarm')
  plt.tight_layout()
  plt.savefig(f'top_15_pitchers_stuff_plus_{dataset_name}.png', dpi=300, bbox_inches='tight')



All Pitcher-Pitch Type combinations before top 15 filter (CornBelters):
        Pitcher TaggedPitchType     Stuff+
   Graham Kasey        Splitter 114.864639
   Graham Kasey        Changeup 112.987427
   Nick Krueger        Changeup 111.943657
Nicholas Currie        Changeup 111.812225
  Eli Tritinger        Changeup 111.693977
 Zach O'donnell        Changeup 111.035957
     Isaac Graf        Fastball 110.997688
    Trey Bryant        Fastball 110.816505
  Will O'gorman        Changeup 110.709343
 Jimmy Amptmann        Fastball 110.696518
   Luka Zachman        Fastball 110.687607
   Cole Noreuil        Changeup 110.672592
   Luka Zachman        Changeup 110.489624
Nicholas Currie        Fastball 110.318756
 Zach O'Donnell        Changeup 110.214401
  Logan Barnett        Fastball 110.162163
Braden Vanderhe        Changeup 110.149475
    Trey Bryant        Changeup 110.099266
Braden Vanderhe        Fastball 110.021706
Jackson Stewart        Fastball 109.929642

Top 15 Pitchers Across 


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=top_15_all_pitches, x='Average Stuff+', y='Label', palette='coolwarm')
