In [41]:
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score, KFold 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
import shap

In [84]:
# Table used further down to look up each player's 'Name' by row label.
full_df = pd.read_csv('Resources/dash_full_batter_data.csv')
batter_woba_df = pd.read_csv("Resources/full_woba_learning_name.csv")

# Columns removed before modelling: the player name plus several z-score columns.
# NOTE(review): presumably the z-score columns are dropped because they are
# derived from the outcome being predicted -- confirm with the data owner.
NON_FEATURE_COLUMNS = [
    'Name',
    'z_scores_avg_woba',
    'z_scores_avg_slg',
    'z_scores_avg_babip',
    'z_scores_avg_wrc+',
    'z_scores_wOBA_2023',
    'zscore_difference_wOBA_2023',
]

batter_woba = (
    batter_woba_df
    .drop(columns=NON_FEATURE_COLUMNS)
    # 'Unnamed: 0' looks like a row index written out by an earlier to_csv
    # (0, 1, 2, ... in the preview). Left in place it would be fed to the model
    # as a feature, so remove it. It is dropped in a separate, tolerant step so
    # the cell still runs if the CSV is regenerated without that column, and the
    # default RangeIndex is kept so row labels keep lining up with full_df.
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
batter_woba.head()

Unnamed: 0,Age,G,AB,PA,H,1B,2B,3B,HR,R,...,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR,avg_wOBA,zscore_difference_woba,z_scores_avg_wOBA
0,37.666667,126.0,454.0,502.666667,120.333333,94.0,15.666667,0.0,10.666667,38.0,...,351.333333,0.152667,0.268,0.246,0.395667,0.307,-0.766667,0.299333,-0.163228,-0.841291
1,32.666667,127.666667,436.0,483.666667,113.666667,68.666667,29.666667,4.666667,10.666667,48.0,...,341.333333,0.157,0.266333,0.246,0.374333,0.303333,1.266667,0.322333,-0.575723,-0.106384
2,34.666667,149.0,523.0,617.0,123.0,77.666667,21.0,0.333333,24.0,76.0,...,426.666667,0.166333,0.249,0.254333,0.446333,0.351333,1.766667,0.327333,-0.459862,0.053379
3,32.666667,148.0,532.0,612.333333,131.666667,85.666667,26.666667,1.666667,17.666667,80.0,...,396.0,0.204333,0.289,0.253667,0.425333,0.338333,1.733333,0.323667,0.649372,-0.06378
4,30.666667,139.0,491.0,579.0,125.666667,72.0,24.333333,2.333333,27.0,79.666667,...,403.0,0.168667,0.257,0.265667,0.458333,0.361333,2.733333,0.360333,-1.456233,1.107812


In [85]:
def train_and_evaluate_logistic_regression(df, target_column, solver='lbfgs', max_iter=100,
                                           test_size=0.2, random_state=42):
    """Fit a SMOTE-balanced, standardised logistic regression and explain it with SHAP.

    The continuous ``target_column`` is binarised (1 when the value is > 0,
    otherwise 0), the data is split into train/test sets, the training set is
    oversampled with SMOTE, features are standardised, and a
    ``LogisticRegression`` is trained and evaluated on the held-out test set.

    The caller's ``df`` is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus ``target_column``. Columns containing any missing
        value are discarded before modelling.
    target_column : str
        Name of the column to binarise and predict.
    solver : str, default 'lbfgs'
        Passed to ``LogisticRegression``.
    max_iter : int, default 100
        Passed to ``LogisticRegression``.
    test_size : float, default 0.2
        Fraction of rows held out for testing (passed to ``train_test_split``).
    random_state : int, default 42
        Seed used for both the train/test split and SMOTE.

    Returns
    -------
    dict
        ``accuracy``            accuracy on the test set
        ``balanced_recall``     ``balanced_accuracy_score`` on the test set
        ``shap_values``         SHAP explanation of the scaled test set
        ``X_test_scaled``       standardised test features
        ``y_test``              binarised test labels
        ``y_pred``              predictions for the test set
        ``y_pred_train``        predictions for the original (un-resampled)
                                training rows, in ``X_train`` order
        ``X_train``             training features before resampling/scaling
        ``X_train_scaled``      standardised, SMOTE-resampled training features
        ``y_train_resampled``   SMOTE-resampled training labels
        ``X_test``              test features before scaling

    Raises
    ------
    KeyError
        If ``target_column`` is absent from ``df`` or contains missing values
        (in which case it is discarded together with the other incomplete columns).
    """
    # Drop every COLUMN that has a missing value. dropna returns a new frame, so
    # the DataFrame passed in by the caller stays exactly as it was.
    complete = df.dropna(axis=1)
    if target_column not in complete.columns:
        raise KeyError(
            f"{target_column!r} is missing from df or contains NaN values "
            "(columns with NaN are dropped before modelling)"
        )

    # Binarise the target: strictly positive -> 1, everything else -> 0.
    y = (complete[target_column] > 0).astype(int)
    X = complete.drop(columns=[target_column])

    # Hold out a test set before any resampling so it only contains real rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Balance the classes in the training data only.
    smote = SMOTE(random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Fit the scaler on the (resampled) training data and reuse it for the test data.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    logreg_model = LogisticRegression(solver=solver, max_iter=max_iter)
    logreg_model.fit(X_train_scaled, y_train_resampled)

    y_pred = logreg_model.predict(X_test_scaled)
    # Predictions for the real training rows (no synthetic SMOTE samples), so
    # downstream reports can compare prediction vs. actual on the training set.
    y_pred_train = logreg_model.predict(scaler.transform(X_train))

    # Explain the test-set predictions, using the scaled training data as background.
    explainer = shap.Explainer(logreg_model, X_train_scaled)
    shap_values = explainer(X_test_scaled)

    return {
        "accuracy": accuracy_score(y_test, y_pred),
        # Key name kept for existing callers; the value is balanced accuracy.
        "balanced_recall": balanced_accuracy_score(y_test, y_pred),
        "shap_values": shap_values,
        "X_test_scaled": X_test_scaled,
        "y_test": y_test,
        "y_pred": y_pred,
        "y_pred_train": y_pred_train,
        "X_train": X_train,
        "X_train_scaled": X_train_scaled,
        "y_train_resampled": y_train_resampled,
        "X_test": X_test,
    }

In [86]:
# Train on the feature table `batter_woba`; the label is the sign of this column.
target_column = 'zscore_difference_woba'
result_woba = train_and_evaluate_logistic_regression(
    df=batter_woba,
    target_column=target_column,
    solver='lbfgs',
    max_iter=200,
)

# Report the headline test-set metrics, rounded to two decimals.
for metric_label, metric_key in (("Accuracy", "accuracy"), ("Balanced Recall", "balanced_recall")):
    print(f"{metric_label}: {result_woba[metric_key]:.2f}")

Accuracy: 0.62
Balanced Recall: 0.62


In [82]:
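# NOTE: this commented-out cell is an earlier draft of create_predictions_dataframe()
# (defined in a later cell); it is kept for reference only and is not executed.
# # Get the predictions and actual labels for both training and testing sets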
# y_pred_train = result_woba['y_train_resampled']
# y_pred_test = result_woba['y_pred']
# y_train = result_woba['y_train_resampled']
# y_test = result_woba['y_test']

# # Get the indexes of the DataFrames X_train and X_test
# indexes_train = result_woba['X_train'].index
# indexes_test = result_woba['X_test'].index

# # Attach the 'Name' feature to the predictions for training set
# names_train = full_df.loc[indexes_train, 'Name']
# predictions_with_names_train = list(zip(names_train, y_pred_train, y_train))

# # Attach the 'Name' feature to the predictions for testing set
# names_test = full_df.loc[indexes_test, 'Name']
# predictions_with_names_test = list(zip(names_test, y_pred_test, y_test))

# # Create DataFrames for the predictions with names for training and testing sets
# predictions_df_train = pd.DataFrame(predictions_with_names_train, columns=['Name', 'Prediction', 'Actual'])
# predictions_df_test = pd.DataFrame(predictions_with_names_test, columns=['Name', 'Prediction', 'Actual'])

# # Concatenate the DataFrames vertically
# pred_actual_df = pd.concat([predictions_df_train, predictions_df_test], ignore_index=True)

In [91]:
def create_predictions_dataframe(result_df, name_df):
    """Build a Name / Prediction / Actual table covering the training and test rows.

    Parameters
    ----------
    result_df : dict
        Result mapping with the keys ``X_train``, ``X_test``,
        ``y_train_resampled``, ``y_pred`` and ``y_test``. If it also contains
        ``y_pred_train`` (model predictions for the original training rows, in
        ``X_train`` order) those are used for the training "Prediction" column.
        Without that key the training "Prediction" column falls back to the
        training labels themselves, so it always equals "Actual" for those rows.
    name_df : pandas.DataFrame
        Frame with a ``Name`` column whose index labels match the index of
        ``X_train`` / ``X_test``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Prediction``, ``Actual``; training rows first, then
        test rows, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If an index label of ``X_train`` / ``X_test`` is not present in ``name_df``.
    """
    train_index = result_df['X_train'].index
    test_index = result_df['X_test'].index
    n_train = len(train_index)

    # The resampled label vector is longer than X_train because it also holds
    # the synthetic samples. Keep only the first n_train labels.
    # NOTE(review): assumes the resampler keeps the original rows first and
    # appends synthetic ones after them -- confirm if the resampler changes.
    actual_train = list(result_df['y_train_resampled'])[:n_train]

    if 'y_pred_train' in result_df:
        predicted_train = list(result_df['y_pred_train'])[:n_train]
    else:
        # Legacy fallback: no training predictions are available in the result,
        # so the labels are echoed as "predictions" (Prediction == Actual).
        predicted_train = actual_train

    # Look the player names up by row label.
    names_train = name_df.loc[train_index, 'Name']
    names_test = name_df.loc[test_index, 'Name']

    columns = ['Name', 'Prediction', 'Actual']
    train_rows = list(zip(names_train, predicted_train, actual_train))
    test_rows = list(zip(names_test, result_df['y_pred'], result_df['y_test']))

    predictions_df_train = pd.DataFrame(train_rows, columns=columns)
    predictions_df_test = pd.DataFrame(test_rows, columns=columns)

    # Stack training rows on top of test rows.
    return pd.concat([predictions_df_train, predictions_df_test], ignore_index=True)

In [94]:
# Pair every prediction with the player's name.
# NOTE(review): names are looked up in full_df, which is read from a different
# CSV than the model features -- verify that both files share the same row order.
pred_actual_woba = create_predictions_dataframe(result_df=result_woba, name_df=full_df)

# Persist the table as CSV, leaving the positional index out of the file.
pred_actual_woba.to_csv(path_or_buf='predictions_woba.csv', index=False)
