In [4]:
import pandas as pd
import numpy as np
import pickle
import glob
import seaborn as sns
import matplotlib.pyplot as plt

# Set Seaborn style for publication-quality plots
sns.set_theme(style="whitegrid", font_scale=1.2)

# Load and split datasets
cornbelters_files = glob.glob("../../CornBeltersData/*.csv")
kcl_files = glob.glob("../../KCLData/*.csv")

# Read and concatenate KCL files
kcl_dfs = [pd.read_csv(f) for f in kcl_files]
kcl_df = pd.concat(kcl_dfs, ignore_index=True) if kcl_dfs else pd.DataFrame()

# Read and concatenate CornBelters files
cornbelters_dfs = [pd.read_csv(f) for f in cornbelters_files]
cornbelters_df = pd.concat(cornbelters_dfs, ignore_index=True) if cornbelters_dfs else pd.DataFrame()
cornbelters_df = cornbelters_df[cornbelters_df['PitcherTeam'] == 'Normal cornbelters']
# Define offspeed pitches
offspeed_pitches = ['Sinker', 'Curveball', 'Slider', 'Cutter']

# Difference calculation functions
def calculate_ff_diff(event, df_ref):
    if pd.notna(event['RelSpeed']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df_ref[(df_ref['Pitcher'] == event['Pitcher']) & 
                              (df_ref['TaggedPitchType'] == 'Fastball')]['RelSpeed'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['RelSpeed']
    return np.nan

def calculate_ivb_diff(event, df_ref):
    if pd.notna(event['InducedVertBreak']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df_ref[(df_ref['Pitcher'] == event['Pitcher']) & 
                              (df_ref['TaggedPitchType'] == 'Fastball')]['InducedVertBreak'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['InducedVertBreak']
    return np.nan

def calculate_hb_diff(event, df_ref):
    if pd.notna(event['HorzBreak']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df_ref[(df_ref['Pitcher'] == event['Pitcher']) & 
                              (df_ref['TaggedPitchType'] == 'Fastball')]['HorzBreak'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['HorzBreak']
    return np.nan

def calculate_angle_diff(event, df_ref):
    if pd.notna(event['VertRelAngle']) and pd.notna(event['TaggedPitchType']) and event['TaggedPitchType'] in offspeed_pitches:
        fastball_avg = df_ref[(df_ref['Pitcher'] == event['Pitcher']) & 
                              (df_ref['TaggedPitchType'] == 'Fastball')]['VertRelAngle'].mean()
        if pd.notna(fastball_avg):
            return fastball_avg - event['VertRelAngle']
    return np.nan

# Define trained dummy columns
trained_dummy_columns = [
    'PitchType_Changeup', 'PitchType_Curveball', 'PitchType_Cutter',
    'PitchType_Fastball', 'PitchType_Knuckleball', 'PitchType_Sinker',
    'PitchType_Slider', 'PitchType_Splitter'
]

# Define model features
features = ['RelSpeed', 'SpinRate', 'RelHeight', 'RelSide', 'Extension',
            'InducedVertBreak', 'VertRelAngle', 'HorzBreak', 'VertApprAngle',
            'ZoneSpeed', 'ff_diff', 'ivb_diff', 'hb_diff', 'ang_diff'] + trained_dummy_columns

# Load Stuff+ model
with open("../stuff_plus_model.pkl", "rb") as f:
    stuff_plus_model = pickle.load(f)

def predict_stuff_plus(event):
    if all(item in event for item in features):
        # Given values
        mean = -0.048827421489919115
        std = 0.254953021360529
        scale_factor = 100  # Controls spread; one std moves score by 10

        # Convert event[features] to a 2D array for prediction
        input_data = event[features].values.reshape(1, -1)

        # Predict stuff_plus
        stuff_plus = stuff_plus_model.predict(input_data)[0]

        # Scale to make 100 the average
        stuff_plus_scaled = 100 + ((stuff_plus - mean) / std * scale_factor)

        return stuff_plus_scaled # Return scalar value
    else:
        return np.nan


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x93 in position 14: invalid start byte

In [None]:
def process_dataset(df, dataset_name):
    if df.empty:
        print(f"No data in {dataset_name} dataset.")
        return None, None, None, None
    df['Pitcher'] = df['Pitcher'].replace('Bob Tomhave', 'Brennan Tomhave')

    # Apply difference calculations
    df['ff_diff'] = df.apply(lambda x: calculate_ff_diff(x, df), axis=1)
    df['ivb_diff'] = df.apply(lambda x: calculate_ivb_diff(x, df), axis=1)
    df['hb_diff'] = df.apply(lambda x: calculate_hb_diff(x, df), axis=1)
    df['ang_diff'] = df.apply(lambda x: calculate_angle_diff(x, df), axis=1)

    # One-hot encode pitch types
    dummies = pd.get_dummies(df['TaggedPitchType'], prefix='PitchType', dtype=float)
    df = pd.concat([df, dummies], axis=1)

    # Add missing dummy columns with 0s
    for col in trained_dummy_columns:
        if col not in df.columns:
            df[col] = 0.0

    # Drop extra dummy columns
    df = df.drop(columns=[col for col in df.columns if col.startswith('PitchType_') and col not in trained_dummy_columns])

    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Apply Stuff+ prediction
    df['Stuff+'] = df.apply(predict_stuff_plus, axis=1)


    pitcher_counts = df['Pitcher'].value_counts()
    valid_pitchers = pitcher_counts[pitcher_counts >= 20].index
    df = df[df['Pitcher'].isin(valid_pitchers)]

    pitch_type_counts = df.groupby(['Pitcher', 'TaggedPitchType']).size().reset_index(name='Count')
    valid_combos = pitch_type_counts[pitch_type_counts['Count'] >= 10][['Pitcher', 'TaggedPitchType']]
    df = df.merge(valid_combos, on=['Pitcher', 'TaggedPitchType'], how='inner')

    # Debug: Check unique pitch types per pitcher
    pitch_counts = df.groupby('Pitcher')['TaggedPitchType'].nunique().reset_index()
    print(f"\nPitchers with multiple pitch types ({dataset_name}):")
    print(pitch_counts[pitch_counts['TaggedPitchType'] > 1].to_string(index=False))

    # --- Table and Plot 1: Top 15 Pitchers by Average Stuff+ ---
    avg_stuff_plus = df.groupby('Pitcher')['Stuff+'].mean().reset_index()
    top_15_pitchers = avg_stuff_plus.sort_values('Stuff+', ascending=False).head(15)
    top_15_pitchers['Stuff+'] = top_15_pitchers['Stuff+'].round(2)
    top_15_pitchers.columns = ['Pitcher', 'Average Stuff+']

    print(f"\nTop 15 Pitchers by Average Stuff+ ({dataset_name}):")
    print(top_15_pitchers.to_string(index=False))

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(data=top_15_pitchers, x='Average Stuff+', y='Pitcher', palette='coolwarm')
    plt.title(f'Top 15 Pitchers by Average Stuff+ ({dataset_name})', fontsize=14, pad=15)
    plt.xlabel('Average Stuff+', fontsize=12)
    plt.ylabel('Pitcher', fontsize=12)
    max_value = top_15_pitchers['Average Stuff+'].max()
    plt.xlim(0, max_value + 10)  # Extend x-axis to accommodate text
    for i, v in enumerate(top_15_pitchers['Average Stuff+']):
        ax.text(v + 1, i, f'{v:.2f}', va='center', fontsize=10)  # Adjust offset
    plt.tight_layout()
    plt.savefig(f'top_15_pitchers_stuff_plus_{dataset_name}.png', dpi=300, bbox_inches='tight')
    plt.close()

    # --- Table and Plot 2: Top 15 Pitchers Across All Pitch Types by Stuff+ ---
    avg_stuff_by_pitch = df.groupby(['Pitcher', 'TaggedPitchType'])['Stuff+'].mean().reset_index()
    top_15_all_pitches = avg_stuff_by_pitch.sort_values('Stuff+', ascending=False).head(15)
    top_15_all_pitches['Stuff+'] = top_15_all_pitches['Stuff+'].round(2)
    top_15_all_pitches.columns = ['Pitcher', 'Pitch Type', 'Average Stuff+']

    # Debug: Print all pitch type-pitcher combinations before taking top 15
    print(f"\nAll Pitcher-Pitch Type combinations before top 15 filter ({dataset_name}):")
    print(avg_stuff_by_pitch.sort_values('Stuff+', ascending=False).head(20).to_string(index=False))

    print(f"\nTop 15 Pitchers Across All Pitch Types by Stuff+ ({dataset_name}):")
    print(top_15_all_pitches.to_string(index=False))

    top_15_all_pitches['Label'] = top_15_all_pitches['Pitcher'] + ' (' + top_15_all_pitches['Pitch Type'] + ')'

    plt.figure(figsize=(15, 10))
    ax = sns.barplot(data=top_15_all_pitches, x='Average Stuff+', y='Label', palette='coolwarm')
    plt.title(f'Top 15 Pitchers Across All Pitch Types by Stuff+ ({dataset_name})', fontsize=14, pad=15)
    plt.xlabel('Average Stuff+', fontsize=12)
    plt.ylabel('Pitcher (Pitch Type)', fontsize=12)
    max_value = top_15_all_pitches['Average Stuff+'].max()
    plt.xlim(0, max_value + 10)  # Extend x-axis to accommodate text
    for i, v in enumerate(top_15_all_pitches['Average Stuff+']):
        ax.text(v + 1, i, f'{v:.2f}', va='center', fontsize=10)  # Adjust offset
    plt.tight_layout()
    plt.savefig(f'top_15_all_pitches_stuff_plus_{dataset_name}.png', dpi=300, bbox_inches='tight')
    plt.close()

    return top_15_pitchers, top_15_all_pitches
# Process both datasets
kcl_top_15_pitchers, kcl_top_15_all_pitches = process_dataset(kcl_df, "KCL")
cornbelters_top_15_pitchers, cornbelters_top_15_all_pitches = process_dataset(cornbelters_df, "CornBelters")




Pitchers with multiple pitch types (KCL):
             Pitcher  TaggedPitchType
         Adan Nieves                4
          Andrew Tay                3
Benedict Hendrickson                2
    Braden Alexander                3
     Braden Deverman                3
         Braden Mehn                4
        Brandon Ward                2
     Brayden Elliott                2
     Brennan Tomhave                2
       Brett Granger                3
   Brody Stonecipher                3
       Cade Starrick                3
         Caden Addis                4
          Caleb Ochs                3
     Cameron Clifton                5
      Canden Hardman                4
      Carter Sellers                3
         Cayden Rose                6
       Cole Mcclusky                3
         Connor Hale                2
       Cooper Stolfa                2
        Devan Tupper                3
        Donte Frantz                2
          Ethan Plym                3
       


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=top_15_pitchers, x='Average Stuff+', y='Pitcher', palette='coolwarm')



All Pitcher-Pitch Type combinations before top 15 filter (KCL):
        Pitcher TaggedPitchType     Stuff+
  Cade Starrick        Changeup 104.998474
Patrick Kennedy          Slider 104.952179
   Nolan Bowles          Slider 104.482117
  Ethan Robbins          Slider 104.422279
    Cayden Rose       Curveball 104.223427
    Adan Nieves          Cutter 104.212128
 Nolan Vanduzer        Fastball 103.973335
Mitchell Wilson          Slider 103.848190
    Luke Klunke          Slider 103.580452
 Canden Hardman          Cutter 103.404251
    Mason Orton        Fastball 103.342224
    Cayden Rose          Slider 103.331245
    Logan Lynch       Curveball 103.143181
   Devan Tupper        Splitter 103.032341
    Connor Hale          Slider 102.940247
    Braden Mehn          Cutter 102.674637
    Caden Addis          Cutter 102.542633
   Brandon Ward       Curveball 102.483727
   Reed Frazier          Slider 102.441917
 Madden Johnson          Slider 102.352844

Top 15 Pitchers Across All Pitc


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=top_15_all_pitches, x='Average Stuff+', y='Label', palette='coolwarm')



Pitchers with multiple pitch types (CornBelters):
        Pitcher  TaggedPitchType
   Alec Bergman                2
   Brice Deaton                2
Dominic Panozzo                2
  Eli Tritinger                2
   Graham Kasey                5
Jackson Mcdonal                2
    Jake Fenton                2
 Jimmy Amptmann                2
  Logan Barnett                3
   Luka Zachman                3
  Mason Orton\t                2
Nicholas Currie                3
   Nick Krueger                3
     Rj Bergren                2
 Toby Schriefer                3
    Trey Bryant                2
  Will O'gorman                2
   Wyatt Mammen                3
   Zach Courson                2
 Zach O'donnell                3

Top 15 Pitchers by Average Stuff+ (CornBelters):
        Pitcher  Average Stuff+
  Mason Orton\t      103.010002
    Trey Bryant      102.500000
   Brice Deaton      101.360001
    Jake Fenton      101.330002
     Isaac Graf      101.169998
   Wyatt Mamme


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=top_15_pitchers, x='Average Stuff+', y='Pitcher', palette='coolwarm')
  plt.tight_layout()
  plt.savefig(f'top_15_pitchers_stuff_plus_{dataset_name}.png', dpi=300, bbox_inches='tight')



All Pitcher-Pitch Type combinations before top 15 filter (CornBelters):
        Pitcher TaggedPitchType     Stuff+
  Mason Orton\t          Slider 103.528358
   Alec Bergman          Slider 103.447937
    Trey Bryant        Fastball 103.141663
   Graham Kasey       Curveball 102.939728
  Mason Orton\t        Fastball 102.725723
   Wyatt Mammen       Curveball 102.688576
   Nick Krueger          Slider 102.524765
 Jimmy Amptmann        Fastball 102.347321
    Jake Fenton        Fastball 101.923233
Jackson Mcdonal       Curveball 101.874718
Nicholas Currie          Slider 101.829071
    Trey Bryant          Slider 101.814751
  Logan Barnett          Sinker 101.726006
   Brice Deaton        Fastball 101.699615
 Toby Schriefer       Curveball 101.555328
     Isaac Graf        Fastball 101.174126
   Graham Kasey        Changeup 101.158386
 Toby Schriefer          Slider 101.148979
   Wyatt Mammen        Fastball 100.847443
Dominic Panozzo        Fastball 100.845070

Top 15 Pitchers Across 


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=top_15_all_pitches, x='Average Stuff+', y='Label', palette='coolwarm')
  plt.tight_layout()
  plt.savefig(f'top_15_all_pitches_stuff_plus_{dataset_name}.png', dpi=300, bbox_inches='tight')


In [None]:
def rank_teams_in_kcl(df, dataset_name="KCL"):
    if df.empty:
        print(f"No data in {dataset_name} dataset.")
        return None, None, None, None
    df['Pitcher'] = df['Pitcher'].replace('Bob Tomhave', 'Brennan Tomhave')

    # Apply difference calculations
    df['ff_diff'] = df.apply(lambda x: calculate_ff_diff(x, df), axis=1)
    df['ivb_diff'] = df.apply(lambda x: calculate_ivb_diff(x, df), axis=1)
    df['hb_diff'] = df.apply(lambda x: calculate_hb_diff(x, df), axis=1)
    df['ang_diff'] = df.apply(lambda x: calculate_angle_diff(x, df), axis=1)

    # One-hot encode pitch types
    dummies = pd.get_dummies(df['TaggedPitchType'], prefix='PitchType', dtype=float)
    df = pd.concat([df, dummies], axis=1)

    # Add missing dummy columns with 0s
    for col in trained_dummy_columns:
        if col not in df.columns:
            df[col] = 0.0
    base_teams = [
        "Kcl groundsloths",
        "Kcl bobcats",
        "Kcl bluecaps",
        "Kcl merchants"
    ]
    
    # Create a mapping for years 2026 to 2030 back to 2025
    team_mapping = {}
    for team in base_teams:
        for year in range(2026, 2031):
            team_mapping[f"{team} {year}"] = f"{team} 2025"

    # Apply mapping
    df['PitcherTeam'] = df['PitcherTeam'].replace(team_mapping)

   
    # Drop extra dummy columns
    df = df.drop(columns=[col for col in df.columns if col.startswith('PitchType_') and col not in trained_dummy_columns])

    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Apply Stuff+ prediction
    df['Stuff+'] = df.apply(predict_stuff_plus, axis=1)
    if df.empty:
        print("KCL dataset is empty.")
        return None

    # Clean up team names if needed (optional, for consistent formatting)
    df['Team'] = df['PitcherTeam'].str.strip().str.title()

    # Group by team and calculate average Stuff+
    avg_team_stuff = df.groupby('Team')['Stuff+'].mean().reset_index()
    avg_team_stuff = avg_team_stuff.sort_values('Stuff+', ascending=False)
    avg_team_stuff['Stuff+'] = avg_team_stuff['Stuff+'].round(2)

    print("\nKCL Team Stuff+ Rankings:")
    print(avg_team_stuff.to_string(index=False))

    # --- Plot: Team Average Stuff+ ---
    plt.figure(figsize=(8, 6))
    ax = sns.barplot(data=avg_team_stuff, x='Stuff+', y='Team', palette='viridis')
    plt.title('Average Team Stuff+ (KCL)', fontsize=14, pad=15)
    plt.xlabel('Average Stuff+', fontsize=12)
    plt.ylabel('Team', fontsize=12)
    max_val = avg_team_stuff['Stuff+'].max()
    plt.xlim(0, max_val + 10)
    for i, v in enumerate(avg_team_stuff['Stuff+']):
        ax.text(v + 1, i, f'{v:.2f}', va='center', fontsize=10)
    plt.tight_layout()
    plt.savefig('kcl_team_stuff_plus_rankings.png', dpi=300, bbox_inches='tight')
    plt.close()

    return avg_team_stuff
kcl_team_rankings = rank_teams_in_kcl(kcl_df, "KCL")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(data=avg_team_stuff, x='Stuff+', y='Team', palette='viridis')



KCL Team Stuff+ Rankings:
                 Team     Stuff+
    Kcl Bluecaps 2025 100.559998
Kcl Groundsloths 2025 100.220001
     Kcl Bobcats 2025 100.150002
   Kcl Merchants 2025 100.129997
