In [54]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
import joblib

# ^Importing all needed libraries and tools

pd.set_option('future.no_silent_downcasting', True)

csv_directory = 'File-pathLocation' #fetches all CSV files in location (Will need an actual file path)
all_files = [os.path.join(csv_directory, f) for f in os.listdir(csv_directory) if f.endswith('.csv')] # Starting from here
df_list = []
for file in all_files:
    df = pd.read_csv(file)
    df_list.append(df)
combined_df = pd.concat(df_list, ignore_index=True) # This loop gathers all CSV files in location and combines them into a single one.

combined_df = combined_df[~combined_df['pitch_type'].str.contains('KN')] # Removes knuckle balls from data; Too few instances
combined_df = combined_df[~combined_df['pitch_type'].str.contains('SV')] # Removes slurves from data; Too few instances
combined_df = combined_df[~combined_df['pitch_type'].str.contains('SC')] # Removes screwballs from data; Too few instances
combined_df = combined_df[~combined_df['pitch_type'].str.contains('CS')] # Removes slow curves from data; Too few instances

combined_df['pitch_type'] = combined_df['pitch_type'].replace('FF', 'F') # Combines Fastballs (2 & 4 seam) and Cutters to simplify predictions
combined_df['pitch_type'] = combined_df['pitch_type'].replace('SI', 'F')
combined_df['pitch_type'] = combined_df['pitch_type'].replace('FC', 'F')

combined_df['pitch_type'] = combined_df['pitch_type'].replace('CH', 'O') # Combines Changeups, Split finger, & Forkballs to simplify predictions
combined_df['pitch_type'] = combined_df['pitch_type'].replace('FS', 'O')
combined_df['pitch_type'] = combined_df['pitch_type'].replace('FO', 'O')

combined_df['pitch_type'] = combined_df['pitch_type'].replace('CU', 'C') # Combines curve balls and knuckle curves to simplify predictions
combined_df['pitch_type'] = combined_df['pitch_type'].replace('KC', 'C')

combined_df['pitch_type'] = combined_df['pitch_type'].replace('SL', 'S') # Combines sliders and Sweepers to simplify predictions
combined_df['pitch_type'] = combined_df['pitch_type'].replace('ST', 'S')

X2 = combined_df[['stand', 'p_throws', 'balls', 'strikes', 'outs_when_up', 'inning', 'bat_score', 'fld_score', 'if_fielding_alignment', 'of_fielding_alignment', 'on_3b', 'on_2b', 'on_1b', 'pitch_number', 'release_speed', 'pfx_x', 'pfx_z']]
X1 = combined_df[['stand', 'p_throws', 'balls', 'strikes', 'outs_when_up', 'inning', 'bat_score', 'fld_score', 'if_fielding_alignment', 'of_fielding_alignment', 'on_3b', 'on_2b', 'on_1b', 'pitch_number']]
# X1 Takes in batter & pitcher handedness, current count; outs; inning; score; field alignment; runners & pitch number. E.G. every gathered data point before a pitch was thrown.
# X2 takes the same conditions but with the percieved release speed of the pitch, and the horizontal/vertical movement of the pitch. E.G Three more key data points after the pitch was thrown.

y = combined_df[['pitch_type']] # Logs subsequent pitch type determined.
X1 = X1.fillna(0) # Replaces empty entries with a '0'
X1['on_3b'] = X1['on_3b'].astype(bool).astype(int) # Converts runner ID numbers to 1 or 0 for true and false conditions; converts baserunner data to be usable by the model.
X1['on_2b'] = X1['on_2b'].astype(bool).astype(int)
X1['on_1b'] = X1['on_1b'].astype(bool).astype(int)

X1['if_fielding_alignment'] = X['if_fielding_alignment'].astype('category').cat.codes.astype(float) # Converts string (ex 'Standard') data into float data, each different instance being a different number
X1['of_fielding_alignment'] = X['of_fielding_alignment'].astype('category').cat.codes.astype(float)

X1['stand'] = X1['stand'].replace('R', 1).infer_objects(copy=False) # Converts 'R' handedness to 1
X1['stand'] = X1['stand'].replace('L', 2).infer_objects(copy=False) # Converts 'L' handedness to 2
X1['p_throws'] = X1['p_throws'].replace('R', 1).infer_objects(copy=False) # Converts 'R' handedness to 1
X1['p_throws'] = X1['p_throws'].replace('L', 2).infer_objects(copy=False) # Converts 'L' handedness to 2

X2 = X2.fillna(0)  # Starting from here
X2['on_3b'] = X2['on_3b'].astype(bool).astype(int)
X2['on_2b'] = X2['on_2b'].astype(bool).astype(int)
X2['on_1b'] = X2['on_1b'].astype(bool).astype(int)

X2['if_fielding_alignment'] = X2['if_fielding_alignment'].astype('category').cat.codes.astype(float)
X2['of_fielding_alignment'] = X2['of_fielding_alignment'].astype('category').cat.codes.astype(float)

X2['stand'] = X2['stand'].replace('R', 1).infer_objects(copy=False)
X2['stand'] = X2['stand'].replace('L', 2).infer_objects(copy=False)
X2['p_throws'] = X2['p_throws'].replace('R', 1).infer_objects(copy=False)
X2['p_throws'] = X2['p_throws'].replace('L', 2).infer_objects(copy=False)  # To here, does the same as above, but for X2.

X1_train,X1_test,y_train,y_test = train_test_split(X1,y,test_size=0.2) # Splits data 80:20, 80% for training and 20% for testing.
X2_train,X2_test,y_train,y_test = train_test_split(X2,y,test_size=0.2)

model1 = HistGradientBoostingClassifier(max_iter=100).fit(X1_train, y_train.values.ravel()) #Trains each model.
model2 = HistGradientBoostingClassifier(max_iter=100).fit(X2_train, y_train.values.ravel())
print(str(int(model1.score(X1_test,y_test)*100)) + "% Accuracy of model 1") # Calculates accuracy of each model.
print(str(int(model2.score(X2_test,y_test)*100)) + "% Accuracy of model 2")

model1 = HistGradientBoostingClassifier(max_iter=100).fit(X1, y.values.ravel()) # Retrains models with 100% of the data.
model2 = HistGradientBoostingClassifier(max_iter=100).fit(X2, y.values.ravel())

joblib.dump(model1, 'Pre-pitchPredictor.joblib') # Saves each model as a file in the same location as this project
joblib.dump(model2, 'Post-pitchPredictor.joblib')

55% Accuracy of model 1
92% Accuracy of model 2


['Post-pitchPredictor.joblib']