### Single Player Pitch Model
This notebook contains models that will take pitch information for a single player and will then train itself to predict future pitches that are thrown by that same player.
The notebook contains code for the following:
1. Random Forest Model
2. Gradient Boosting Model
3. Parameter Tuning Guidelines

In [None]:
import numpy as np
import pandas as pd
import argparse
import matplotlib.pyplot as plt
from pybaseball import pitching_stats
from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.preprocessing import label_binarize

In [None]:
# Looks up the pitcher through the pybaseball API and downloads his pitch-level data.
# Edit the four variables below to pick a different player or date range.
start_date = '2010-01-01'
end_date = '2017-11-07'
first_name = 'felix'
last_name = 'hernandez'
id = playerid_lookup(last_name, first_name)
# The [0] entry of the lookup result's key_mlbam column is used as the player id.
id_number = id['key_mlbam'][0]
felix = statcast_pitcher(
    start_dt=start_date,
    end_dt=end_date,
    player_id=id_number,
)

In [None]:
# Chooses the features that will be used in the model.
# .copy() gives df its own data, so the column assignments made in later cells
# modify an independent frame rather than a slice of `felix`
# (this avoids pandas' SettingWithCopyWarning).
df = felix[['pitch_type', 'balls', 'strikes', 'stand', 'outs_when_up', 'inning', 'on_3b', 'on_2b', 'on_1b', 'at_bat_number', 'pitch_number']].copy()

In [None]:
# Replaces null values and maps some of the features to 1 and 0.
# Missing entries may arrive either as the string 'null' or as NaN; both must count
# as "no value". A plain `x != 0` test is True for NaN, which would mark every
# missing base-runner entry as 1.
df = df.replace('null', 0)
for base_column in ['on_1b', 'on_2b', 'on_3b']:
    # 1 when the column holds a non-zero, non-missing value, 0 otherwise.
    occupied = df[base_column].notna() & (df[base_column] != 0)
    df[base_column] = occupied.astype(int)
# Batter handedness: right -> 1, left -> 0 (any other value becomes NaN via map).
df['stand'] = df['stand'].map({'R': 1, 'L': 0})

In [None]:
# Builds a single feature, ball_strike, that encodes the ball/strike count as an integer code,
# then removes the separate balls and strikes columns.
# The position of each (balls, strikes) pair in this list is the code it receives.
count_codes = [
    (0, 0), (1, 0), (2, 0), (3, 0),
    (0, 1), (0, 2),
    (1, 1), (1, 2),
    (2, 1), (2, 2),
    (3, 1), (3, 2),
]
conditions = [
    (df['balls'] == balls) & (df['strikes'] == strikes)
    for balls, strikes in count_codes
]
choices = list(range(len(count_codes)))
# Rows matching none of the counts receive np.select's default value, 0.
df['ball_strike'] = np.select(conditions, choices)
df = df.drop(columns=['balls', 'strikes'])

In [None]:
# Finds all pitch types that occur less than the variable percentage and drops them.
# This cell block allows you to modify how many pitch types will be predicted in your model - modify the percentage variable.
# Rows without a pitch type are removed first. A missing pitch type shows up either as 0
# (after the earlier 'null' replacement) or as NaN; NaN rows would otherwise survive and
# later be encoded as class -1 by pd.factorize.
df = df[df.pitch_type.notna() & (df.pitch_type != 0)]
z = df.pitch_type.value_counts() / len(df)  # share of the remaining pitches per pitch type
percentage = 0.15
# Pitch types whose share falls below the threshold.
drop_list = z[z < percentage].index.tolist()

# Drops rows with pitch types that do not occur often enough
df = df[~df.pitch_type.isin(drop_list)]

# Shows the pitch types and the amount of each pitch type that remains
df.pitch_type.value_counts()

In [None]:
# Optional code that allows the user to have equal amounts of data for each pitch type (suggested for one-pitch heavy players).
# Adjust the baseline_value variable to change how many occurrences you want for each pitch type.
# Note: .head() keeps the FIRST baseline_value rows of each pitch type in df's current order; it does not sample at random.
baseline_value = 3000
pitch_list = df.pitch_type.unique()
# One capped frame per pitch type, in the order the pitch types first appear.
capped_frames = [
    df.loc[df['pitch_type'] == pitch].head(baseline_value)
    for pitch in pitch_list
]
# Concatenate once. Seeding the concat with an empty DataFrame is unnecessary and relies on
# dtype handling for empty frames that recent pandas versions deprecate.
# With no pitch types at all, fall back to an empty frame with df's columns.
df_pitch_final = pd.concat(capped_frames) if capped_frames else df.head(0)
df = df_pitch_final

### Random Forest

In [None]:
# Creates the random forest model and fits the data.
# Pass parameters such as max_depth, n_estimators or min_samples_leaf to RandomForestClassifier to tune the model.
# Change the test_size variable to adjust how much data is held out to test your model.
# y holds integer codes for the pitch types; label[code] gives back the pitch type name.
y, label = pd.factorize(df['pitch_type'])
X = df.drop('pitch_type', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# A fixed random_state makes the forest (and every score derived from it) reproducible
# across runs, matching the seeded train/test split above.
rfs = RandomForestClassifier(random_state=42)
rfs.fit(X_train, y_train)
results = rfs.predict(X_test)

In [None]:
# Pairs every training column name with the importance the random forest assigned to it.
list(zip(X_train.columns, rfs.feature_importances_))

In [None]:
# Confusion matrix of the random forest predictions on the held-out data:
# rows are the true pitch codes, columns are the predicted pitch codes.
confusion_matrix(y_true=y_test, y_pred=results)

In [None]:
# Fraction of held-out pitches that the random forest classified correctly.
accuracy_score(y_true=y_test, y_pred=results)

### Gradient Boosting

In [None]:
# Creates the gradient boosting model and fits the data.
# Pass parameters such as n_estimators and learning_rate to GradientBoostingClassifier for better results.
# Change the test_size variable to adjust how much data is held out to test your model.
# The split uses the same X, y, test_size and random_state as the random forest section,
# so both models are evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# A fixed random_state makes the boosted model reproducible across runs.
gbs = GradientBoostingClassifier(random_state=42)
gbs.fit(X_train, y_train)
results = gbs.predict(X_test)

In [None]:
# Shows the feature importance for each input into the gradient boosting model.
# This must read from gbs (the model fitted in this section), not from the random forest rfs.
list(zip(X_train, gbs.feature_importances_))

In [None]:
# Confusion matrix of the gradient boosting predictions on the held-out data:
# rows are the true pitch codes, columns are the predicted pitch codes.
confusion_matrix(y_true=y_test, y_pred=results)

In [None]:
# Fraction of held-out pitches that the gradient boosting model classified correctly.
accuracy_score(y_true=y_test, y_pred=results)

### Parameter tuning

#### GridSearchCV - Random Forest
- GridSearchCV is a good way to test how multiple parameters interact when adjusted.

In [None]:
# Candidate values for the random forest grid search - every entry can be changed.
# Further parameters that can be searched (left disabled here):
#   'max_features': ['sqrt', 'log2']
#   'criterion': ['gini', 'entropy']
#   'max_depth': [4, 8, 10]
param_grid = dict(
    n_estimators=[200, 700],
)

In [None]:
# Runs a 5-fold grid search over param_grid for the random forest, using all available cores,
# then shows the best parameter combination found.
CV_rfs = GridSearchCV(rfs, param_grid, n_jobs=-1, cv=5)
CV_rfs.fit(X_train, y_train)
CV_rfs.best_params_

In [None]:
# Predicts the held-out data with the grid-searched random forest and shows its accuracy score.
results = CV_rfs.predict(X_test)
accuracy_score(y_true=y_test, y_pred=results)

#### GridSearchCV - Gradient Booster

In [None]:
# Sets up the variables and values for the gradient boosting GridSearchCV - all can be changed.
# The criterion values follow current scikit-learn naming: 'mse' was renamed to
# 'squared_error' and 'mae' is no longer accepted by GradientBoostingClassifier,
# so listing the old names makes those grid fits fail.
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators': [10, 25, 50, 100],
    'max_depth': [3, 5, 10],
    'criterion': ['friedman_mse', 'squared_error'],
}

In [None]:
# Runs a 5-fold grid search over param_grid for the gradient boosting model, using all available cores,
# then shows the best parameter combination found.
# Pass verbose=10 to GridSearchCV to watch progress; this cell could take a while to execute.
CV_gbs = GridSearchCV(gbs, param_grid, n_jobs=-1, cv=5)
CV_gbs.fit(X_train, y_train)
CV_gbs.best_params_

In [None]:
# Displays every grid search result as a pandas dataframe, so the user can see
# which parameters actually have a robust impact on the model.
pd.DataFrame(data=CV_gbs.cv_results_)

In [None]:
# Predicts the held-out data with the grid-searched gradient boosting model and shows its accuracy score.
results = CV_gbs.predict(X_test)
accuracy_score(y_true=y_test, y_pred=results)

#### Individual parameter tuning based on accuracy score - Random Forest
- This can be used to see trends in how certain variables impact the model as well as finding interesting values to plug into the gridsearchcv.

In [None]:
# Sweeps the number of trees in the forest (1 through 49) and plots the test accuracy of each setting.
estimator_counts = range(1, 50)
score = []
for number in estimator_counts:
    tree = RandomForestClassifier(n_estimators=number).fit(X_train, y_train)
    results = tree.predict(X_test)
    score.append(accuracy_score(y_test, results))
plt.plot(list(estimator_counts), score)
plt.xlabel('n_estimators')
plt.ylabel('accuracy_score')
plt.show()

In [None]:
# Sweeps the maximum depth allowed for each tree (1 through 10) and plots the test accuracy of each setting.
depth_values = range(1, 11)
score = []
for depth in depth_values:
    tree = RandomForestClassifier(max_depth=depth).fit(X_train, y_train)
    results = tree.predict(X_test)
    score.append(accuracy_score(y_test, results))
plt.plot(list(depth_values), score)
plt.xlabel('max_depth')
plt.ylabel('accuracy_score')
plt.show()

In [None]:
# How many features are allowed to be considered at each split.
# The sweep runs from 1 up to the actual number of training columns instead of a
# hard-coded upper bound: max_features may not exceed the number of features, and a
# fixed bound silently skips the largest settings when more features are present.
feature_counts = range(1, X_train.shape[1] + 1)
score = []
for number in feature_counts:
    tree = RandomForestClassifier(max_features=number)
    tree.fit(X_train, y_train)
    results = tree.predict(X_test)
    score.append(accuracy_score(y_test, results))
plt.plot(list(feature_counts), score)
plt.xlabel('max_features')
plt.ylabel('accuracy_score')
plt.show()

In [None]:
# Sweeps the minimum number of samples required at a leaf node (1 through 199)
# and plots the test accuracy of each setting.
leaf_sizes = range(1, 200)
score = []
for number in leaf_sizes:
    tree = RandomForestClassifier(min_samples_leaf=number).fit(X_train, y_train)
    results = tree.predict(X_test)
    score.append(accuracy_score(y_test, results))
plt.plot(list(leaf_sizes), score)
plt.xlabel('min_samples_leaf')
plt.ylabel('accuracy_score')
plt.show()

#### ROC Curves - Random Forest
- Good for seeing the true positive rate for each pitch type. The more area under the curve, the better.

In [None]:
# Set up the y_testb and y_pred_prob arrays.
# The classes are taken from the fitted random forest, so pitch_amount always equals the
# number of pitch types the model predicts and the columns of y_testb line up with the
# columns of predict_proba (no manual adjustment needed when `percentage` changes).
pitch_classes = rfs.classes_
pitch_amount = len(pitch_classes)
y_testb = label_binarize(y_test, classes=pitch_classes)
if y_testb.shape[1] == 1:
    # For exactly two classes label_binarize returns a single column (the second class);
    # expand it to one indicator column per class so per-class indexing works.
    y_testb = np.hstack([1 - y_testb, y_testb])
y_pred_prob = rfs.predict_proba(X_test)

In [None]:
# Plots one ROC curve per pitch type (one-vs-rest) and creates a legend with the proper labels.
fprs = []
tprs = []
for i in range(0, pitch_amount):
    # roc_curve expects a score that increases with the likelihood of the positive class,
    # so the predicted probability is passed directly; using 1 - probability would flip
    # every curve below the diagonal.
    fpr, tpr, thresholds = roc_curve(y_testb[:, i], y_pred_prob[:, i])
    fprs.append(fpr)
    tprs.append(tpr)
colors = ['y', 'g', 'r', 'b', 'm', 'c', 'k', 'w']
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
for i in range(0, pitch_amount):
    # Cycle through the colours so more pitch types than colours does not raise an IndexError.
    plt.plot(fprs[i], tprs[i], c=colors[i % len(colors)], label=label[i])
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()