This notebook will be testing different classification models and comparing them

In [9]:

import pandas as pd
import numpy as np
df = pd.read_csv('..\data\max_fried.csv')
df.head()

Unnamed: 0,balls,strikes,outs_when_up,inning,at_bat_number,pitch_number,home_score,away_score,bat_score,n_thruorder_pitcher,...,pitch_type_map,prop_CU,prop_FC,prop_FF,prop_SI,prop_CH,prop_SL,batter_is_right,pitcher_is_right,inning_top
0,0,2,2,7,55,3,0,5,0,3,...,0.0,0.276364,0.130909,0.290909,0.130909,0.163636,0.007273,0,0,0
1,0,1,2,7,55,2,0,5,0,3,...,5.0,0.276364,0.130909,0.290909,0.130909,0.163636,0.007273,0,0,0
2,0,0,2,7,55,1,0,5,0,3,...,0.0,0.276364,0.130909,0.290909,0.130909,0.163636,0.007273,0,0,0
3,3,2,2,7,54,6,0,5,0,3,...,5.0,0.276364,0.130909,0.290909,0.130909,0.163636,0.007273,1,0,0
4,2,2,2,7,54,5,0,5,0,3,...,3.0,0.276364,0.130909,0.290909,0.130909,0.163636,0.007273,1,0,0


In [None]:
#split into train-test-validate

from sklearn.model_selection import train_test_split, cross_val_score

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_curve,
    auc,
    confusion_matrix,
    classification_report,
)

# Plotting (for ROC curve and confusion matrix)
import matplotlib.pyplot as plt
import seaborn as sns

X = df.drop(columns=["pitch_type_map"])
y = df["pitch_type_map"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
#Model 1 Multiple logistic regression
lr_model = LogisticRegression(random_state=42, max_iter=10000)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_predictions))
print("Logistic Regression F1 Score:", f1_score(y_test, lr_predictions, average='weighted'))

Logistic Regression Accuracy: 0.4098360655737705
Logistic Regression F1 Score: 0.35019870839542966


In [None]:
#model 2 KNN
highest_acc = 0
highest_f1 = 0
best_k = 0
for i in range(1,30):
    knn_model = KNeighborsClassifier(n_neighbors= i)
    knn_model.fit(X_train, y_train)
    knn_predictions = knn_model.predict(X_test)
    acc =  accuracy_score(y_test, knn_predictions)
    f1 = f1_score(y_test, knn_predictions, average='weighted')
    if acc > highest_acc:
        highest_f1 = f1
        highest_acc = acc
        best_k = i
    
    # print(f"KNN Accuracy for {i}:", accuracy_score(y_test, knn_predictions))
    # print("KNN F1 Score:", f1_score(y_test, knn_predictions, average='weighted'))
print('Best K for KNN: ', best_k)
print('KNN Accuracy: ', highest_acc)
print('KNN F1 Score: ', highest_acc)


    


KNN Accuracy for 1: 0.26229508196721313
KNN Accuracy for 2: 0.3360655737704918
KNN Accuracy for 3: 0.3114754098360656
KNN Accuracy for 4: 0.30327868852459017
KNN Accuracy for 5: 0.32786885245901637
KNN Accuracy for 6: 0.319672131147541
KNN Accuracy for 7: 0.3524590163934426
KNN Accuracy for 8: 0.36885245901639346
KNN Accuracy for 9: 0.3770491803278688
KNN Accuracy for 10: 0.36065573770491804
KNN Accuracy for 11: 0.3770491803278688
KNN Accuracy for 12: 0.3770491803278688
KNN Accuracy for 13: 0.36065573770491804
KNN Accuracy for 14: 0.3442622950819672
KNN Accuracy for 15: 0.36885245901639346
KNN Accuracy for 16: 0.3442622950819672
KNN Accuracy for 17: 0.319672131147541
KNN Accuracy for 18: 0.319672131147541
KNN Accuracy for 19: 0.319672131147541


In [None]:
#Model 3 Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


rf = RandomForestClassifier()
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}


grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")


rf_model = RandomForestClassifier(n_estimators=10000, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluation
print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
print("Random Forest F1 Score:", f1_score(y_test, rf_predictions, average='weighted'))

Random Forest Accuracy: 0.32786885245901637
Random Forest F1 Score: 0.32357716637521744


In [None]:
#Model 4 XGBoost

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define the model
xgb = XGBClassifier()

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search
grid_search.fit(X_train, y_train)

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=45)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_predictions))
print("XGBoost F1 Score:", f1_score(y_test, xgb_predictions, average='weighted'))

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.4098360655737705
XGBoost F1 Score: 0.414385587115755


Validating on some different pitchers

In [77]:
from sklearn.preprocessing import LabelEncoder

df_val = pd.read_csv('/Users/cstone/Documents/Projects/MLB_Pitch_Prediction/data/raw/charlie_morton.csv')


cols = ['balls', 'strikes', 'outs_when_up', 'batter_is_right', 'pitcher_is_right', 'n_thruorder_pitcher', 'pitch_type_map']
df_val = df_val[cols]

Xval = df_val.drop(columns=['pitch_type_map'])
yval = df_val['pitch_type_map']



Xvaltrain, Xvaltest, yvaltrain, yvaltest = train_test_split(Xval, yval, test_size=0.25, random_state=41)
label_encoder = LabelEncoder()
yvaltrain_encoded = label_encoder.fit_transform(yvaltrain)
# Initialize the XGBClassifier
xgb_val_model = XGBClassifier(eval_metric='logloss', random_state=42)

# Fit the model
xgb_val_model.fit(Xvaltrain, yvaltrain_encoded)

# Make predictions
xgb_val_predictions = xgb_val_model.predict(Xvaltest)

# Evaluate the model
print("XGBoost Accuracy:", accuracy_score(yvaltest, xgb_val_predictions))
print("XGBoost F1 Score:", f1_score(yvaltest, xgb_val_predictions, average='weighted'))

XGBoost Accuracy: 0.4075757575757576
XGBoost F1 Score: 0.36542222858537105


In [None]:
#Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=10000)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

#KNN
highest_acc = 0
highest_f1 = 0
best_k = 0
for i in range(1,30):
    knn_model = KNeighborsClassifier(n_neighbors= i)
    knn_model.fit(X_train, y_train)
    knn_predictions = knn_model.predict(X_test)
    acc =  accuracy_score(y_test, knn_predictions)
    f1 = f1_score(y_test, knn_predictions, average='weighted')
    if acc > highest_acc:
        highest_f1 = f1
        highest_acc = acc
        best_k = i


#Random Forest
rf_model = RandomForestClassifier(n_estimators=10000, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluations
print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
print("Random Forest F1 Score:", f1_score(y_test, rf_predictions, average='weighted'))

print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_predictions))
print("Logistic Regression F1 Score:", f1_score(y_test, lr_predictions, average='weighted'))

print('Best K for KNN: ', best_k)
print('KNN Accuracy: ', highest_acc)
print('KNN F1 Score: ', highest_acc)

print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
print("Random Forest F1 Score:", f1_score(y_test, rf_predictions, average='weighted'))


array([0, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 2, 0, 3, 0, 3, 3, 3, 4, 3, 3, 3,
       0, 2, 3, 3, 0, 0, 4, 0, 3, 3, 3, 0, 3, 0, 3, 3, 3, 0, 3, 3, 3, 3,
       0, 3, 3, 3, 3, 4, 3, 3, 3, 3, 0, 0, 3, 3, 3, 0, 3, 0, 3, 3, 3, 3,
       3, 3, 0, 0, 3, 3, 4, 0, 3, 0, 2, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3,
       3, 4, 3, 1, 0, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3,
       2, 0, 4, 3, 0, 3, 0, 0, 3, 3, 3, 2, 3, 3, 3, 3, 0, 3, 0, 3, 3, 3,
       3, 3, 3, 0, 3, 3, 4, 1, 0, 3, 3, 0, 0, 3, 0, 0, 3, 3, 3, 3, 3, 4,
       3, 3, 3, 0, 3, 3, 3, 0, 0, 0, 3, 2, 3, 2, 0, 3, 3, 3, 0, 3, 3, 0,
       0, 0, 3, 0, 3, 3, 3, 3, 0, 2, 0, 0, 0, 0, 3, 0, 0, 3, 3, 2, 3, 3,
       3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0, 3, 3, 3, 0, 0, 0, 3,
       3, 0, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 3, 0, 0, 3, 0, 0, 3, 0,
       4, 0, 3, 0, 3, 3, 3, 3, 3, 0, 3, 0, 3, 1, 0, 3, 3, 3, 3, 0, 4, 0,
       0, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 0, 3, 0, 0, 3,
       0, 3, 3, 3, 3, 3, 3, 0, 2, 3, 2, 3, 0, 3, 4,

xgb_val_predictions

In [59]:
print("Training classes:", np.unique(yvaltrain))
print("Training classes:", np.unique(yvaltrain_encoded))

print("Testing classes:", np.unique(yvaltest))

Training classes: [0 1 2 3 5]
Training classes: [0 1 2 3 4]
Testing classes: [0 1 2 3 5]


In [None]:
# Accuracy

# F1-Score

# Confusion Matrice

# ROC Cruve