### Binary Pitch Model
This notebook contains models that take pitch information for a single player or a list of players and train on it to predict future pitches. The goal is to simulate the mindset of a batter approaching an at-bat: many times a hitter will first decide whether to look for a fastball or an off-speed pitch.
The notebook contains code for the following:
1. Single Player Random Forest Model
2. Single Player Gradient Boosting Model
3. Multi-Player Random Forest
4. Multi-Player Gradient Booster

Note: Please see the single player model for parameter tuning guidelines.

In [None]:
import numpy as np
import pandas as pd
import argparse
import matplotlib.pyplot as plt
from pybaseball import pitching_stats
from pybaseball import playerid_lookup
from pybaseball import statcast_pitcher
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

## Single Player

In [None]:
# Downloads every pitch thrown by one pitcher from baseballsavant via the pybaseball api.
# Change the four variables below to select a specific player and date range.
first_name = 'felix'
last_name = 'hernandez'
start_date = '2010-01-01'
end_date = '2017-11-07'

# playerid_lookup takes the last name first. The result is stored in
# `player_lookup` rather than `id` so the builtin id() is not shadowed for
# the rest of the notebook session.
player_lookup = playerid_lookup(last_name, first_name)
if len(player_lookup) == 0:
    # Fail with a readable message instead of an opaque indexing error below.
    raise ValueError('No player found for {} {}'.format(first_name, last_name))

# Take the first match by position. NOTE(review): if several players share
# this name, the first row may not be the intended one - inspect player_lookup.
id_number = player_lookup.key_mlbam.iloc[0]
felix = statcast_pitcher(start_dt=start_date, end_dt=end_date, player_id=id_number)

In [None]:
# Displays the first row of the downloaded data to inspect its structure
# and the features (columns) that are available.
felix.head(1)

In [None]:
# Keeps only the columns used by the model: pitch_type (the prediction
# target) plus the count, batter side, outs, inning, base and pitch-sequence columns.
model_columns = [
    'pitch_type', 'balls', 'strikes', 'stand', 'outs_when_up', 'inning',
    'on_3b', 'on_2b', 'on_1b', 'at_bat_number', 'pitch_number',
]
df = felix[model_columns]

In [None]:
# Replaces null values and maps some of the features to 1 and 0.
# Missing values may arrive either as the literal string 'null' or as NaN
# (NOTE(review): which one depends on how the data was parsed upstream -
# confirm), so both representations are handled.
df = df.replace('null', 0)

# A base counts as occupied only when its column holds a non-missing,
# non-zero value. NaN has to be tested explicitly: NaN != 0 evaluates to
# True, so the comparison alone would flag every empty base as occupied.
for base_col in ('on_1b', 'on_2b', 'on_3b'):
    occupied = df[base_col].notna() & (df[base_col] != 0)
    df[base_col] = occupied.astype('int64')

# Encodes stand: 'R' -> 1, 'L' -> 0 (any other value becomes NaN).
df['stand'] = df['stand'].map({'R': 1, 'L': 0})

In [None]:
# One-hot encodes the ball/strike count: one 1/0 column for each count from
# 0-0 through 3-2, named '<balls>_<strikes>' in words (e.g. 'three_two').
# The raw balls and strikes columns are dropped afterwards.
number_words = ('zero', 'one', 'two', 'three')
for n_balls, ball_word in enumerate(number_words):
    # Strikes only range over 0-2, so the last word is not used here.
    for n_strikes, strike_word in enumerate(number_words[:3]):
        in_count = (df['balls'] == n_balls) & (df['strikes'] == n_strikes)
        df[ball_word + '_' + strike_word] = in_count.map({True: 1, False: 0})
df = df.drop(['balls', 'strikes'], axis=1)

In [None]:
# Collapses pitch_type into the binary target: 'FF' (the fastball class) -> 0
# and every other value -> 1 (the off-speed class).
is_not_ff = df['pitch_type'] != 'FF'
df['pitch_type'] = is_not_ff.map({True: 1, False: 0})

In [None]:
# Optional code that caps the amount of data kept for each pitch type so the
# classes are closer to balanced (suggested for one-pitch heavy players).
# Adjust the baseline_value variable to change how many occurrences are kept
# for each pitch type. Rows are taken with head(), i.e. the first
# baseline_value rows of each type in the current row order, not a random sample.
baseline_value = 3000
pitch_list = df.pitch_type.unique()

# One capped slice per pitch type, in order of first appearance.
capped_parts = [
    df.loc[df['pitch_type'] == pitch].head(baseline_value)
    for pitch in pitch_list
]

# Concatenate the slices directly: seeding the concat with an empty
# DataFrame is unnecessary and its dtype handling is deprecated in pandas.
# With no pitch types at all, fall back to an empty frame with the same
# columns instead of leaving df_pitch_final undefined (or stale from an
# earlier run of this cell).
df_pitch_final = pd.concat(capped_parts) if capped_parts else df.iloc[0:0]
df = df_pitch_final

### Random Forest Single Player (Binary)

In [None]:
# Builds and fits the random forest model, then predicts the held-out rows.
# Pass parameters such as max_depth, n_estimators and min_samples_leaf to
# RandomForestClassifier to tune the model.
# Change test_size to adjust how much data is held out to test the model.

# Target: integer codes for pitch_type; `label` maps each code back to its value.
y, label = pd.factorize(df['pitch_type'])
# Features: every column except the target.
X = df.loc[:, df.columns != 'pitch_type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
rfs = RandomForestClassifier().fit(X_train, y_train)
results = rfs.predict(X_test)

In [None]:
# Shows the feature importance for each input into the random forest model,
# as (column name, importance) pairs; iterating X_train yields its column names.
list(zip(X_train, rfs.feature_importances_))

In [None]:
# Confusion matrix for the held-out test data.
# By scikit-learn's convention rows are the true classes and columns are the
# predicted classes. The classes are the integer codes from pd.factorize;
# `label` maps each code back to its pitch_type value.
confusion_matrix(y_test, results)

In [None]:
# Accuracy score: the fraction of test rows whose predicted class matches
# the true class.
accuracy_score(y_test, results)

### Gradient Booster Single Player (Binary)

In [None]:
# Builds and fits the gradient boosting model, then predicts the held-out rows.
# Adjust n_estimators and learning_rate for better results.
# Change test_size to adjust how much data is held out to test the model.
# X and y are reused as they currently stand; the split uses a fixed seed
# (random_state=42) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
gbs = GradientBoostingClassifier(n_estimators=1000).fit(X_train, y_train)
results = gbs.predict(X_test)

In [None]:
# Shows the feature importance for each input into the gradient boosting
# model, as (column name, importance) pairs. The importances must come from
# `gbs`, the model fitted in this section, not from the random forest `rfs`.
list(zip(X_train, gbs.feature_importances_))

In [None]:
# Confusion matrix for the held-out test data.
# By scikit-learn's convention rows are the true classes and columns are the
# predicted classes. The classes are the integer codes from pd.factorize;
# `label` maps each code back to its pitch_type value.
confusion_matrix(y_test, results)

In [None]:
# Accuracy score: the fraction of test rows whose predicted class matches
# the true class.
accuracy_score(y_test, results)

## Multiplayer

In [None]:
# Loads the pitch data for every pitcher listed in a text file and stacks it
# into new_df.
# The file holds whitespace-separated "first_name last_name" pairs (no commas
# between names); every name must be exactly two tokens. Put the file name in
# the file variable. There is no limit to the number of names.
file = 'player_list.txt'
multi_start_date = '2010-01-01'
multi_end_date = '2017-11-07'

# The context manager closes the file handle as soon as it has been read.
with open(file) as f:
    text = f.read().split()
if len(text) % 2 != 0:
    raise ValueError(
        '{} must contain first/last name pairs; found an odd number of tokens'.format(file)
    )

# (first, last) pairs rather than a dict keyed by first name, so two pitchers
# who share a first name are both kept. dict.fromkeys removes exact duplicate
# names while preserving the file order.
players = list(dict.fromkeys(zip(text[0::2], text[1::2])))

# Download each pitcher exactly once and combine everything at the end.
player_frames = []
for first, last in players:
    print(first)
    # playerid_lookup takes the last name first. `player_lookup` is used
    # instead of `id` so the builtin id() is not shadowed.
    player_lookup = playerid_lookup(last, first)
    if len(player_lookup) == 0:
        raise ValueError('No player found for {} {}'.format(first, last))
    id_number = player_lookup.key_mlbam.iloc[0]
    player_frames.append(
        statcast_pitcher(start_dt=multi_start_date, end_dt=multi_end_date, player_id=id_number)
    )

if not player_frames:
    raise ValueError('No player names found in {}'.format(file))

# Reversed so that later-listed pitchers come first in new_df. This keeps the
# row order that prepending each new download used to produce, which matters
# to any downstream step that takes the first N rows.
new_df = pd.concat(player_frames[::-1])

In [None]:
# Keeps only the columns used by the model: pitch_type (the prediction
# target) plus the count, batter side, outs, inning, base and pitch-sequence columns.
model_columns = [
    'pitch_type', 'balls', 'strikes', 'stand', 'outs_when_up', 'inning',
    'on_3b', 'on_2b', 'on_1b', 'at_bat_number', 'pitch_number',
]
df = new_df[model_columns]

In [None]:
# Replaces null values and maps some of the features to 1 and 0.
# Missing values may arrive either as the literal string 'null' or as NaN
# (NOTE(review): which one depends on how the data was parsed upstream -
# confirm), so both representations are handled.
df = df.replace('null', 0)

# A base counts as occupied only when its column holds a non-missing,
# non-zero value. NaN has to be tested explicitly: NaN != 0 evaluates to
# True, so the comparison alone would flag every empty base as occupied.
for base_col in ('on_1b', 'on_2b', 'on_3b'):
    occupied = df[base_col].notna() & (df[base_col] != 0)
    df[base_col] = occupied.astype('int64')

# Encodes stand: 'R' -> 1, 'L' -> 0 (any other value becomes NaN).
df['stand'] = df['stand'].map({'R': 1, 'L': 0})

In [None]:
# Creates a new feature, ball_strike, holding one integer code (0-11) per
# ball/strike count, then drops the raw balls and strikes columns.
# The position of each (balls, strikes) pair below is its code.
# Rows matching none of the pairs receive np.select's default value of 0.
count_pairs = [
    (0, 0), (1, 0), (2, 0), (3, 0),
    (0, 1), (0, 2),
    (1, 1), (1, 2),
    (2, 1), (2, 2),
    (3, 1), (3, 2),
]
conditions = [
    (df['balls'] == n_balls) & (df['strikes'] == n_strikes)
    for n_balls, n_strikes in count_pairs
]
choices = list(range(len(count_pairs)))
df['ball_strike'] = np.select(conditions, choices)
df = df.drop(['balls', 'strikes'], axis=1)

In [None]:
# Removes pitch types that occur less often than the percentage variable.
# This cell block allows you to modify how many pitch types will be predicted
# in your model - modify the percentage variable.

# Drop rows with no pitch type first. Missing values can appear as 0 (after a
# 'null' -> 0 replacement) or as NaN; NaN != 0 is True, so NaN needs its own test.
df = df[df.pitch_type.notna() & (df.pitch_type != 0)]

percentage = 0.05
# Share of the remaining rows belonging to each pitch type.
z = df.pitch_type.value_counts() / len(df)

# Select the rare pitch types by label and drop them in a single pass,
# instead of matching types through float equality on their frequencies.
rare_pitch_types = z.index[z < percentage]
df = df[~df.pitch_type.isin(rare_pitch_types)]

# Displays the pitch types and the amount of each pitch type that remains
df.pitch_type.value_counts()

In [None]:
# Collapses pitch_type into the binary target: 'FF' (the fastball class) -> 0
# and every other value -> 1 (the off-speed class).
is_not_ff = df['pitch_type'] != 'FF'
df['pitch_type'] = is_not_ff.map({True: 1, False: 0})

In [None]:
# Optional code that caps the amount of data kept for each pitch type so the
# classes are closer to balanced (suggested for one-pitch heavy players).
# Adjust the baseline_value variable to change how many occurrences are kept
# for each pitch type. Rows are taken with head(), i.e. the first
# baseline_value rows of each type in the current row order, not a random sample.
baseline_value = 20000
pitch_list = df.pitch_type.unique()

# One capped slice per pitch type, in order of first appearance.
capped_parts = [
    df.loc[df['pitch_type'] == pitch].head(baseline_value)
    for pitch in pitch_list
]

# Concatenate the slices directly: seeding the concat with an empty
# DataFrame is unnecessary and its dtype handling is deprecated in pandas.
# With no pitch types at all, fall back to an empty frame with the same
# columns instead of leaving df_pitch_final undefined (or stale from an
# earlier run of a similar cell).
df_pitch_final = pd.concat(capped_parts) if capped_parts else df.iloc[0:0]
df = df_pitch_final

### Random Forest Multi-Player (Binary)

In [None]:
# Builds and fits the random forest model, then predicts the held-out rows.
# Pass parameters such as max_depth, n_estimators and min_samples_leaf to
# RandomForestClassifier to tune the model.
# Change test_size to adjust how much data is held out to test the model.

# Target: integer codes for pitch_type; `label` maps each code back to its value.
y, label = pd.factorize(df['pitch_type'])
# Features: every column except the target.
X = df.loc[:, df.columns != 'pitch_type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
rfs = RandomForestClassifier().fit(X_train, y_train)
results = rfs.predict(X_test)

In [None]:
# Shows the feature importance for each input into the random forest model,
# as (column name, importance) pairs; iterating X_train yields its column names.
list(zip(X_train, rfs.feature_importances_))

In [None]:
# Confusion matrix for the held-out test data.
# By scikit-learn's convention rows are the true classes and columns are the
# predicted classes. The classes are the integer codes from pd.factorize;
# `label` maps each code back to its pitch_type value.
confusion_matrix(y_test, results)

In [None]:
# Accuracy score: the fraction of test rows whose predicted class matches
# the true class.
accuracy_score(y_test, results)

### Gradient Booster Multi-Player (Binary)

In [None]:
# Builds and fits the gradient boosting model, then predicts the held-out rows.
# Adjust n_estimators and learning_rate for better results.
# Change test_size to adjust how much data is held out to test the model.
# X and y are reused as they currently stand; the split uses a fixed seed
# (random_state=42) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
gbs = GradientBoostingClassifier(n_estimators=1000).fit(X_train, y_train)
results = gbs.predict(X_test)

In [None]:
# Shows the feature importance for each input into the gradient boosting
# model, as (column name, importance) pairs. The importances must come from
# `gbs`, the model fitted in this section, not from the random forest `rfs`.
list(zip(X_train, gbs.feature_importances_))

In [None]:
# Confusion matrix for the held-out test data.
# By scikit-learn's convention rows are the true classes and columns are the
# predicted classes. The classes are the integer codes from pd.factorize;
# `label` maps each code back to its pitch_type value.
confusion_matrix(y_test, results)

In [None]:
# Accuracy score: the fraction of test rows whose predicted class matches
# the true class.
accuracy_score(y_test, results)