In [None]:
# add_xBA.py: Add xBA column to 2025.csv for all hit types and summarize by batter

# Import libraries
import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns

# File paths
# NOTE(review): data_path is rooted at 'CornBelters/', the two CSV outputs are
# rooted at 'Data/', and plot_dir is an absolute path at the filesystem root.
# The relative ones resolve against the current working directory -- confirm
# that these are the intended locations.
data_path = 'CornBelters/Data/BeesBelters5-28.csv'
output_path = 'Data/2025_with_xBA.csv'
summary_path = 'Data/batter_xBA_summary.csv'
plot_dir = '/xBA/'

# Create plot directory if it doesn't exist.
# exist_ok=True replaces a separate os.path.exists() check, so there is no gap
# between "check" and "create" in which the directory can appear and make
# makedirs() raise FileExistsError.
os.makedirs(plot_dir, exist_ok=True)

# Define features used by the xBA models.
# NOTE(review): the column order here is the order passed to each scaler and
# model -- presumably it must match the order used at training time; confirm.
features = ['ExitSpeed', 'Angle', 'Direction', 'HitSpinRate', 'Distance', 'Bearing', 'HangTime']

# Load one model and one scaler per hit type, keyed by the hit-type name.
hit_types = ['GroundBall', 'Popup', 'LineDrive', 'FlyBall']
models = {}
scalers = {}
for hit_type in hit_types:
    model_path = f'xgb_model_{hit_type}.pkl'
    scaler_path = f'scaler_{hit_type}.pkl'
    try:
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file. Only load .pkl files produced by a trusted training run.
        with open(model_path, 'rb') as f:
            models[hit_type] = pickle.load(f)
        with open(scaler_path, 'rb') as f:
            scalers[hit_type] = pickle.load(f)
    except FileNotFoundError as e:
        print(f"Error: {e}. Ensure xBA.py has been run to generate {model_path} and {scaler_path}.")
        # Raise SystemExit instead of calling exit(): exit() is an
        # interactive helper installed by the `site` module, and in a
        # notebook it does not reliably stop the rest of the cell from
        # running. Raising aborts execution in both scripts and notebooks.
        raise SystemExit(1) from e

# Load dataset
try:
    df = pd.read_csv(data_path, low_memory=False)
except FileNotFoundError as e:
    print(f"Error: {data_path} not found.")
    # Raise SystemExit instead of calling exit(): in a notebook, exit() lets
    # the remaining statements run, which then fail with
    # "NameError: name 'df' is not defined" instead of stopping here.
    raise SystemExit(1) from e

# Inspect dataset: column names, shape, and the distinct values of the two
# categorical columns the rest of the script relies on.
print("Dataset columns:", df.columns.tolist())
print("Dataset shape:", df.shape)
print("Unique TaggedHitType values:", df['TaggedHitType'].unique())
print("Unique PlayResult values:", df['PlayResult'].unique())

# Check for missing features BEFORE touching the columns: indexing df[col]
# for an absent column raises KeyError, which would pre-empt this friendlier
# error message if the conversion loop ran first.
missing_cols = [col for col in features if col not in df.columns]
if missing_cols:
    print(f"Error: Missing columns {missing_cols} in dataset.")
    # SystemExit (rather than exit()) reliably stops execution in both
    # scripts and notebooks.
    raise SystemExit(1)

# Ensure features are numeric; values that cannot be parsed become NaN
# (errors='coerce') and are later skipped by the prediction step.
for col in features:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Predict xBA for a single row using the model/scaler of its hit type
def predict_xba(row, models, scalers, features):
    """Return the predicted xBA for one row, or NaN when it cannot be scored.

    Args:
        row: Mapping-like record (e.g. a pandas Series) holding a
            'TaggedHitType' entry and the feature columns.
        models: Dict mapping hit type to an object exposing
            ``predict_proba``.
        scalers: Dict mapping hit type to an object exposing ``transform``.
        features: List of column names fed to the scaler, in order.

    Returns:
        Element ``[0, 1]`` of the model's ``predict_proba`` output
        (presumably the probability of the positive class -- confirm against
        the training script), or ``np.nan`` if the row's hit type has no
        model or any feature value is missing.
    """
    kind = row['TaggedHitType']

    # No model was loaded for this hit type: nothing to predict with.
    if kind not in models:
        return np.nan

    feature_values = row[features]

    # Any missing feature makes the row unscorable.
    if pd.isna(feature_values).any():
        return np.nan

    # The scaler and model receive a 2-D array, so shape it as one sample.
    sample = feature_values.values.reshape(1, -1)
    scaled_sample = scalers[kind].transform(sample)
    probabilities = models[kind].predict_proba(scaled_sample)
    return probabilities[0, 1]

# Filter batted ball events: boolean mask of rows whose TaggedHitType is one
# of the hit types a model was loaded for.
batted_balls = df['TaggedHitType'].isin(hit_types)

# Predict xBA for batted balls
print("Predicting xBA for batted balls...")
batted_ball_rows = df[batted_balls]
if batted_ball_rows.empty:
    # DataFrame.apply(axis=1) on an empty frame returns an empty DataFrame
    # rather than a Series, which cannot be assigned to a single column.
    # With nothing to score, every row simply gets NaN.
    df['xBA'] = np.nan
else:
    # Assignment aligns on the index, so rows that are not batted balls are
    # left as NaN.
    df['xBA'] = batted_ball_rows.apply(
        lambda row: predict_xba(row, models, scalers, features), axis=1
    )

# Move xBA to the end
cols = [col for col in df.columns if col != 'xBA'] + ['xBA']
df = df[cols]

# Check xBA results
print("xBA column added. Summary:")
print(df['xBA'].describe())
print("Missing xBA values:", df['xBA'].isna().sum())

# Save updated dataset.
# Create the parent directory first (as is done for plot_dir): to_csv does
# not create missing directories and fails if the target folder is absent.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Updated dataset saved to {output_path}")

# Compute average xBA by Batter, using only rows that received an xBA value.
rows_with_xba = df[df['xBA'].notna()]
xba_summary = rows_with_xba.groupby('Batter')['xBA'].mean().reset_index()
xba_summary = xba_summary.sort_values('xBA', ascending=False)
xba_summary.columns = ['Batter', 'Average_xBA']

# Keep only batters with a minimum sample size.
# NOTE: despite the "pa" naming, the count below is the number of rows with a
# non-null xBA per batter (scored batted balls), not plate appearances.
min_pa = 10
pa_counts = rows_with_xba['Batter'].value_counts()
qualified_batters = pa_counts[pa_counts >= min_pa].index
xba_summary = xba_summary[xba_summary['Batter'].isin(qualified_batters)]

# Save xBA summary.
# Create the parent directory first: to_csv does not create missing
# directories and fails if the target folder is absent.
summary_dir = os.path.dirname(summary_path)
if summary_dir:
    os.makedirs(summary_dir, exist_ok=True)
xba_summary.to_csv(summary_path, index=False)
print(f"Batter xBA summary saved to {summary_path}")

# Show the first ten rows of the summary (already sorted by descending xBA)
print("\nTop 10 Batters by Average xBA:")
print(xba_summary.head(10))

# Bar plot of the first 20 summary rows, written to the plot directory
plot_path = os.path.join(plot_dir, 'batter_xBA_bar_plot.png')
fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(data=xba_summary.head(20), x='Average_xBA', y='Batter', ax=ax)
ax.set_title('Top 20 Batters by Average xBA')
ax.set_xlabel('Average xBA')
ax.set_ylabel('Batter')
fig.savefig(plot_path)
# Close the figure once it is on disk so it does not linger in pyplot's state.
plt.close(fig)
print(f"Batter xBA bar plot saved to {plot_path}")

Error: CornBelters/Data/BeesBelters5-28.csv not found.


NameError: name 'df' is not defined