In [1]:
import pandas as pd
import pybaseball as pyb
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
#Rapsodo : Savant

gloss = {
    'Velocity' : 'release_speed'
    ,'Total Spin' : 'release_spin_rate'
    ,'True Spin (release)' : 'N/Eq'
    ,'Spin Efficiency (release)' : 'N/Eq'
    ,'Spin Direction' :'*spin_axis'
    ,'HB (trajectory)' : 'pfx_x'
    ,'VB (trajectory)' : 'pfx_z'
    ,'Release Height' : 'release_extension'
    ,'Release Side' : 'N/Eq'
    ,'Release Angle' : 'release_pos_z'
    ,'Horizontal Angle' : 'release_pos_x'
    ,'Gyro Degree (deg)' : 'N/Eq'
    ,'Pitch Type' :'pitch_type'
    ,'Is Strike' : 'Strike',
    'Player Name' : 'player_name'
}


#Pitch types to numbers
pitchtype = {'CH':5, 'FB':1, 'FA':1, 'SL':3, 'CU':2, 'FF':1, 'FS':1,
            'Fastball':1, 'CurveBall':2, 'Slider':3, '-':float('nan'), 'ChangeUp':5
             }

In [3]:
def clean_rapsodo_df(csv):

    all_seasions = pd.read_csv(csv)

    #change pitches to numbers
    all_seasions['Pitch Type'] = all_seasions['Pitch Type'].map(pitchtype)
    
    all_seasions = all_seasions.replace('-', np.nan)

    all_seasions[['Velocity', 'Total Spin', 'HB (trajectory)', 'VB (trajectory)', 'Release Height', 
             'Release Angle', 'Horizontal Angle', 'Pitch Type']] = all_seasions[['Velocity', 'Total Spin', 
             'HB (trajectory)', 'VB (trajectory)', 'Release Height', 'Release Angle', 
             'Horizontal Angle', 'Pitch Type']].astype(float)
    
    #Rapsodo velo adjuster 
    all_seasions['Velocity'] = all_seasions['Velocity'].apply(lambda x: x*1.0919273389060635)

    #rename columns to match savant data. N/Eq means no equivalent 
    all_seasions = pd.DataFrame(all_seasions).rename(columns=gloss)
    
    all_seasions = all_seasions[['player_name','release_speed','release_spin_rate','*spin_axis','pfx_x','pfx_z',
                                 'Strike','release_extension','release_pos_z','release_pos_x','pitch_type']]
    
    all_seasions.dropna(inplace=True)
    return all_seasions

def get_metrics(test, predict):
    accuracy = metrics.accuracy_score(test, predict)
    error = 1-accuracy
    return accuracy, error

#takes in a rapsodo csv and features to be used in the random forest

#returns a data frame of the rapsodo info and adds [player_comp] which is the indivual pitches mlb comp 
def random_forest_rapsodo(mlb_metrics, rapsodo_df ,features):
    
    x_train = mlb_metrics.sample(frac = .75)[features+ ['player_pitch']]
    x_test = mlb_metrics.drop(x_train.index)[features + ['player_pitch']]

    y_train = x_train['player_pitch'].astype('str')
    y_test = x_test['player_pitch'].astype('str')

    # Scale data
    for feat in x_train.columns[0:-1]:
        x_train[feat] = (x_train[feat] - x_train[feat].mean()) / x_train[feat].std()
        
    for feat in x_test.columns[0:-1]:
        x_test[feat] = (x_test[feat] - x_test[feat].mean()) / x_test[feat].std()

    x_test.drop(x_test.columns[-1], axis=1, inplace=True)
    x_train.drop(x_train.columns[-1], axis=1, inplace=True)
        
    #cross val descion tree
    scores_list = []

    for depth in range(1, 25):
        regressor = RandomForestClassifier(random_state=0, max_depth = depth)
        regressor.fit(x_train, y_train)
        y_pred_test = regressor.predict(x_test)
        accuracy, error = get_metrics(y_test, y_pred_test)
        if len(scores_list) != 0 and accuracy > max(scores_list):
            optimal_depth_ctree = depth
        scores_list.append(accuracy)

    X = mlb_metrics[features]

    y = mlb_metrics['player_pitch']

    classifier = KNeighborsClassifier(n_neighbors=optimal_depth_ctree)
    classifier.fit(X, y)

    rapsodo_df['player_comp'] = classifier.predict(rapsodo_df[features])
    
    return rapsodo_df, classifier

In [4]:
# Remove player names from rapsodo data for sharing

'''df = pd.read_csv('live.txt')
df.drop(columns=['Player ID'],inplace = True)

# create a dictionary mapping each unique name to a unique identifier
name_dict = {name: 'player{}'.format(i+1) for i, name in enumerate(df['player_name'].unique())}

# replace the original names with the unique identifiers
df['player_name'] = df['player_name'].replace(name_dict)

df.to_csv('anonymized_northeastern.csv', index=False)'''

"df = pd.read_csv('live.txt')\ndf.drop(columns=['Player ID'],inplace = True)\n\n# create a dictionary mapping each unique name to a unique identifier\nname_dict = {name: 'player{}'.format(i+1) for i, name in enumerate(df['player_name'].unique())}\n\n# replace the original names with the unique identifiers\ndf['player_name'] = df['player_name'].replace(name_dict)\n\ndf.to_csv('anonymized_northeastern.csv', index=False)"

In [5]:
# import every pitch of the 2022 season
season22 = pyb.statcast(start_dt='2022-06-01', end_dt='2022-06-20')

# only keep important metrics
mlb_metrics = season22[['player_name','release_speed','release_pos_x','release_pos_z','release_spin_rate',
                        'release_extension','effective_speed','spin_axis','pfx_z','pfx_x','pitch_type']].copy()
mlb_metrics.dropna(inplace=True)

# sort values in alphabetical order for ease when looking at 3d plot
mlb_metrics.sort_values('player_name', ascending=True, inplace=True)

# change the movement numbers from feet to inches 
mlb_metrics[['pfx_x', 'pfx_z']] *= 12

# change pitch type from letters to numbers
mlb_metrics['player_pitch'] = mlb_metrics['player_name']+' '+mlb_metrics['pitch_type']

# find which pitcher threw at least 100 pitches
pitch_count = mlb_metrics.groupby('player_name').count()
pitch_count = pitch_count.where(pitch_count > 100)

# save those who have qualified and save as a list
qualified = pitch_count[pitch_count['release_speed'].notna()]
qualified_lst = list(qualified['pfx_x'].keys())

# create df without unqualified pitchers 
mlb_metrics_qual = mlb_metrics[mlb_metrics['player_name'].isin(qualified_lst)]
rapsodo = 'pitchinggroup.csv'
features = ['release_speed','pfx_z','pfx_x']

# cleaned rapsodo data with numbers normalized to savant
sodo_df = pd.read_csv('anonymized_northeastern.csv')
sodo_df['player_pitch'] = sodo_df['player_name'].str.cat(sodo_df['pitch_type'].astype(str), sep=' ')

This is a large query, it may take a moment to complete


100%|███████████████████████████████████████████| 20/20 [00:22<00:00,  1.14s/it]


In [9]:
len(mlb_metrics_qual['player_name'].unique())

310

In [6]:
rapsodo_df, rf_clf = random_forest_rapsodo(mlb_metrics_qual,sodo_df,features)

  X = check_array(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [7]:
pd.set_option('display.max_rows', 500)
player_comps = rapsodo_df[['player_name','pitch_type','player_comp']].groupby(['player_name','pitch_type']).max()
player_comps

Unnamed: 0_level_0,Unnamed: 1_level_0,player_comp
player_name,pitch_type,Unnamed: 2_level_1
player1,CU,"Ray, Robbie KC"
player1,FC,"Lynch, Daniel SL"
player1,FF,"Vizcaíno, Arodys SL"
player1,SL,"Vizcaíno, Arodys SL"
player10,CH,"Ray, Robbie KC"
player10,FF,"Kikuchi, Yusei SL"
player10,SL,"Ray, Robbie KC"
player11,CH,"Ray, Robbie KC"
player11,CU,"Irvin, Cole CU"
player11,FC,"Ray, Robbie KC"
