# Pitch Sequence and Location ML Model: Decision Tree
This model uses Decision Treees to predict the type and location of a pitch thrown by Paul Skenes.

In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn import tree

import warnings
# Suppress overflow warnings from sklearn/numpy
warnings.filterwarnings("ignore", category=RuntimeWarning)
np.seterr(over='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

## Data Loading
Load data and check with head()

In [2]:
df = pd.read_csv('PaulSkenes_data.csv')
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,FF,9/16/2025,97.8,-2.41,5.51,"Skenes, Paul",663538,694973,single,hit_into_play,...,1.0,1.59,1.26,1.26,20.4,3.667252,10.051646,19.332678,26.925733,19.079675
1,SL,9/16/2025,86.2,-2.31,5.43,"Skenes, Paul",663538,694973,,ball,...,1.0,2.46,-0.67,-0.67,24.1,,,,,
2,CH,9/16/2025,89.1,-2.58,5.42,"Skenes, Paul",663538,694973,,foul,...,1.0,2.61,1.38,1.38,17.7,23.239272,-35.861707,38.36165,21.094334,39.365557
3,SL,9/16/2025,84.9,-2.5,5.46,"Skenes, Paul",663538,694973,,foul,...,1.0,2.52,0.01,0.01,23.6,11.419466,-20.163636,24.883915,28.004997,34.737746
4,FF,9/16/2025,97.5,-2.35,5.56,"Skenes, Paul",663538,694973,,foul,...,1.0,1.22,0.88,0.88,20.3,-1.36564,13.799354,25.743627,33.916405,21.652949


## Feature Engineering

Conver the target catergorical features into numerical features.

In [3]:
pitch_le = LabelEncoder()
zone_le = LabelEncoder()

df["pitch_type_encoded"] = pitch_le.fit_transform(df["pitch_type"])
df["zone_encoded"] = zone_le.fit_transform(df["zone"])

Separate the necessary featyresinto numerical and categorical features.  
Categorical features will need to be converted into numerical features.

In [4]:
numerical = [
    'balls', 'strikes', 'outs_when_up', 'inning', 'pitch_number', 'n_priorpa_thisgame_player_at_bat', 'n_thruorder_pitcher', 
    'on_1b', 'on_2b', 'on_3b'
]

categorical = [
    'stand'
]

Convert batter handedness into numerical with get_dummies.

In [5]:
df = pd.get_dummies(df, columns=['stand'], prefix='stand')

Combine the three on_base features into one numerical boolean feature.

In [6]:
df['on_base'] = df[['on_1b', 'on_2b', 'on_3b']].notna().any(axis=1).astype(int)
df = df.drop(columns=['on_1b', 'on_2b', 'on_3b'])

The features that will be used by the model to train/test.

In [7]:
cols = [
    'balls', 'strikes', 'outs_when_up', 'inning', 'pitch_number', 'n_priorpa_thisgame_player_at_bat', 'n_thruorder_pitcher', 
    'stand_L', 'stand_R', 'on_base'
]

X are the features, and y is an array that contains both the pitch type and zone, the target variables.

In [8]:
X = df[cols]

y = df[["pitch_type_encoded", "zone_encoded"]]
y = np.array(y)

## Train/Test Split
Split data into training and test sets (80/20 split, with random seed of 42 for reproducible results). 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Training the Model
Define all the parameters that GridSearch will search. GridSearch will search through every combination and return the best performing model.

In [10]:
param_grid = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

Define tree model and start the GridSearch

In [11]:
multi_tree = tree.DecisionTreeClassifier(random_state=42)

grid = GridSearchCV(
    estimator=multi_tree,
    scoring='accuracy',
    cv=3,
    param_grid=param_grid,
    n_jobs=-1
)

multi_tree.fit(X_train, y_train)

Print the best cross-validation score and best parameters found:

In [12]:
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py", line 227, in accuracy_score
    y

nan
{'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2}


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/utils/_param_validation.py", line 216, in wrapper
    return func(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_classification.py", line 227, in accuracy_score
    y

Refit the multi-tree with the best parameters.

In [13]:
multi_tree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=2,
    min_samples_split=2
)

multi_tree.fit(X_train, y_train)

pred contains the trained output from the tree (which is an array). Split the array into pitch type and zone.

In [14]:
pred = multi_tree.predict(X_test)

pitch_pred_encoded = pred[:, 0]
zone_pred_encoded = pred[:, 1]

Evaluate the model's accuracy for pitch type and zone. Compare the true labels to the model's predictions.
Print the accuracy scores.

In [15]:
pitch_acc = accuracy_score(y_test[:, 0], pred[:, 0])
zone_acc = accuracy_score(y_test[:, 1], pred[:, 1])

print("Pitch Type Accuracy:", pitch_acc)
print("Zone Accuracy:", zone_acc)

Pitch Type Accuracy: 0.40308747855917665
Zone Accuracy: 0.16638078902229847
