In [None]:
# Add ../ to the module search path so imports in later cells can also be
# resolved relative to the parent directory of this notebook.
import sys

# Guard keeps the cell idempotent: re-running it must not stack duplicate
# '../' entries onto sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
import pandas as pd 
import numpy as np
from lime.lime_tabular import LimeTabularExplainer
import shap

from xgboost_model import XBGBaseballModel
from matplotlib import pyplot as plt

from data.baseball_data_loader import BaseballDataLoader
from data.baseball_data_model import get_feature_names

# --- Training split: the 2021, 2022 and 2023 CSV files ---
train_data_path = [
    "../csv_data/2021_data.csv",
    "../csv_data/2022_data.csv",
    "../csv_data/2023_data.csv",
]
train_loader = BaseballDataLoader(train_data_path)
X_train, y_train = train_loader.get_training_data()

# --- Evaluation split: the 2024 CSV file ---
# `loader` is kept around because later cells use it to look up single games.
eval_data_path = ["../csv_data/2024_data.csv"]
loader = BaseballDataLoader(eval_data_path)
X_eval, y_eval = loader.get_training_data()

# --- Previously trained model, restored from disk ---
# `model_wrapper` is the project wrapper; `xgb_model` is the object it exposes
# as `.model`, which the importance and SHAP cells operate on directly.
model_wrapper = XBGBaseballModel.from_path("../trained_models/baseball_xgb.json")
xgb_model = model_wrapper.model

# Column labels are derived from the first evaluation row, then attached to the
# evaluation matrix so downstream plots show names instead of positions.
feature_names = get_feature_names(X_eval[0], path="../csv_data/")
X_eval_df = pd.DataFrame(data=X_eval, columns=feature_names)

In [None]:
## XGBoost model interpretability
# One row per feature: its name next to the importance score reported by the
# fitted model's `feature_importances_` attribute.
df_fi = pd.DataFrame(
    {
        'feature': feature_names,
        'importance': xgb_model.feature_importances_,
    }
)

# Show the ten highest-scoring features (cell output).
df_fi.sort_values(by='importance', ascending=False).head(10)

In [None]:
# SHAP values for every row of the evaluation frame, computed with the
# tree-model explainer built from the loaded XGBoost model.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_eval_df)

# A new matplotlib figure is opened right before the summary plot is requested.
f = plt.figure()
shap.summary_plot(
    shap_values,
    X_eval_df,
    plot_type='dot',
    show=True,
    plot_size=[10, 6],
)

In [None]:
# Look up both teams' entries for the same game (same date/time string).
astros = loader.find_game(team_name="Houston Astros", date="2024-07-27 6:10PM")
dodgers = loader.find_game(team_name="Los Angeles Dodgers", date="2024-07-27 6:10PM")

# Each team's features are coerced to plain Python floats before being handed
# to the wrapper's predict().
astros_features = list(map(float, astros.team_features))
astros_predicted_runs = model_wrapper.predict(astros_features)

dodgers_features = list(map(float, dodgers.team_features))
dodgers_predicted_runs = model_wrapper.predict(dodgers_features)

print(f"Astros predicted runs: {astros_predicted_runs}")
print(f"Dodgers predicted runs: {dodgers_predicted_runs}")



In [None]:
# Date/time strings identifying the three games of the series.
games = ["2024-07-26 7:10PM", "2024-07-27 6:10PM", "2024-07-28 1:10PM"]

print("DODGERS @ HOUSTON SERIES")
for game in games:
    # Fetch each team's entry for this game.
    astros = loader.find_game(team_name="Houston Astros", date=game)
    dodgers = loader.find_game(team_name="Los Angeles Dodgers", date=game)

    # Features are coerced to plain floats before prediction.
    astros_features = list(map(float, astros.team_features))
    astros_predicted_runs = model_wrapper.predict(astros_features)

    dodgers_features = list(map(float, dodgers.team_features))
    dodgers_predicted_runs = model_wrapper.predict(dodgers_features)

    # Report predicted vs. recorded scores, Dodgers listed first.
    print(f"\n{game}")
    print(f"PREDICTED: {dodgers_predicted_runs} vs {astros_predicted_runs}")
    print(f"ACTUAL: {dodgers.team_score} vs {astros.team_score}")

In [None]:
import numpy as np
from lime.lime_tabular import LimeTabularExplainer

# The game to explain: the middle entry of the series list used earlier.
game = "2024-07-27 6:10PM"

# Fetch both teams' entries for that game.
astros = loader.find_game(team_name="Houston Astros", date=game)
dodgers = loader.find_game(team_name="Los Angeles Dodgers", date=game)

# LIME is given NumPy arrays, so the float-coerced features are wrapped in
# np.array here (earlier cells kept them as plain lists).
astros_features = np.array(list(map(float, astros.team_features)))
dodgers_features = np.array(list(map(float, dodgers.team_features)))

# Predictions for the same rows; the plot titles in later cells reuse them.
astros_predicted_runs = model_wrapper.predict(astros_features)
dodgers_predicted_runs = model_wrapper.predict(dodgers_features)

class NonNegativeLimeTabularExplainer(LimeTabularExplainer):
    """LimeTabularExplainer variant that keeps all sampled feature values >= 0.

    Two things are clipped at zero:

    * the instance being explained (in ``explain_instance``), and
    * the perturbed neighbourhood that LIME samples around it.

    NOTE(review): the sampling hook relies on lime internals. lime's
    ``explain_instance`` draws its samples through the private method
    ``__data_inverse``, which Python name-mangles to
    ``_LimeTabularExplainer__data_inverse``. A public ``data_inverse`` method
    is never called by the base class (and has no parent implementation to
    delegate to), so the override must use the mangled name to take effect.
    Re-check this hook whenever the installed lime version changes.
    """

    def _LimeTabularExplainer__data_inverse(self, data_row, *args, **kwargs):
        """Sample the neighbourhood as the base class does, then clip it at 0.

        Args:
            data_row: The instance whose neighbourhood is being sampled.
            *args, **kwargs: Forwarded untouched so the override works with
                whatever extra sampling arguments the installed lime passes
                (e.g. the number of samples).

        Returns:
            The ``(data, inverse)`` pair produced by the base class, with
            negative entries of dense arrays replaced by 0.
        """
        data, inverse = super()._LimeTabularExplainer__data_inverse(
            data_row, *args, **kwargs
        )
        # Clip both arrays so the values fed to the surrogate model stay
        # consistent with the values the real model is asked to predict on.
        # Non-ndarray results (e.g. sparse matrices) are passed through as-is
        # because np.clip does not support them.
        if isinstance(data, np.ndarray):
            data = np.clip(data, a_min=0, a_max=None)
        if isinstance(inverse, np.ndarray):
            inverse = np.clip(inverse, a_min=0, a_max=None)
        return data, inverse

    def explain_instance(self, data_row, predict_fn, *args, **kwargs):
        """Explain ``data_row`` after clipping its negative entries to 0.

        All remaining arguments are forwarded unchanged to
        ``LimeTabularExplainer.explain_instance``; its return value is
        returned as-is.
        """
        data_row = np.clip(data_row, a_min=0, a_max=None)
        return super().explain_instance(data_row, predict_fn, *args, **kwargs)


# Prediction callback handed to LIME's explain_instance.
def predict_fn(input_data):
    """Return predictions for ``input_data`` from the wrapped model.

    The call goes straight to ``model_wrapper.model.predict`` (the object
    exposed by the wrapper), not to the wrapper's own ``predict``.
    """
    estimator = model_wrapper.model
    return estimator.predict(input_data)


# LIME builds each explanation from randomly sampled perturbations, so an
# unseeded explainer yields different weights on every run. Pinning the seed
# makes the printed and plotted explanations reproducible.
LIME_RANDOM_STATE = 42

# Initialize the LIME explainer with training data
explainer = NonNegativeLimeTabularExplainer(
    training_data=X_train,
    training_labels=y_train,
    feature_names=feature_names,
    mode='regression',
    discretize_continuous=False,
    sample_around_instance=True,
    random_state=LIME_RANDOM_STATE,
)

# Generate explanations for the Astros
exp_astros = explainer.explain_instance(
    data_row=astros_features,
    predict_fn=predict_fn
)

# Generate explanations for the Dodgers
exp_dodgers = explainer.explain_instance(
    data_row=dodgers_features,
    predict_fn=predict_fn
)


def _print_explanation(header, explanation):
    """Print ``header`` followed by one ``feature: weight`` line per entry."""
    print(header)
    for feature, weight in explanation.as_list():
        print(f"{feature}: {weight:.4f}")


# Print the explanations for both teams (blank line separates the two lists)
_print_explanation("Astros Prediction Explanation:", exp_astros)
_print_explanation("\nDodgers Prediction Explanation:", exp_dodgers)

In [None]:
# Render the Astros explanation as a matplotlib figure and title it with the
# predicted score rounded to two decimals.
fig_astros = exp_astros.as_pyplot_figure()
plt.title(f"Lime Explanation - Astros Score = {round(float(astros_predicted_runs), 2)}")
plt.show()

In [None]:
# Render the Dodgers explanation as a matplotlib figure and title it with the
# predicted score rounded to two decimals.
fig_dodgers = exp_dodgers.as_pyplot_figure()
plt.title(f"Lime Explanation - Dodgers Score = {round(float(dodgers_predicted_runs), 2)}")
plt.show()