In [None]:
# add_xBA.py: Add an xBA column to the dataset at Data/2025.csv for all hit types and list all batters by average xBA

# Import libraries
import pandas as pd
import numpy as np
import pickle
import os

# File paths
# Input CSV that is read into `df`.
data_path = 'Data/2025.csv'
# Destination for the full dataset once the xBA column has been added.
output_path = 'Data/2025wxBA.csv'
# Destination for the per-batter average xBA table.
summary_path = 'Data/batter_xBA.csv'

# Load the pickled model and scaler for each hit type, keyed by hit type.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so only load artifacts from a trusted source.
hit_types = ['GroundBall', 'Popup', 'LineDrive', 'FlyBall']
models = {}
scalers = {}
for hit_type in hit_types:
    model_path = f'xgb_model_{hit_type}.pkl'
    scaler_path = f'scaler_{hit_type}.pkl'
    try:
        with open(model_path, 'rb') as f:
            models[hit_type] = pickle.load(f)
        with open(scaler_path, 'rb') as f:
            scalers[hit_type] = pickle.load(f)
    except FileNotFoundError as e:
        print(f"Error: {e}. Ensure xBA.py has been run to generate {model_path} and {scaler_path}.")
        # exit() is a helper injected by the `site` module for interactive
        # sessions and is not guaranteed to exist; raising SystemExit stops
        # the script with the same status code without relying on it.
        raise SystemExit(1)

# Load dataset
# low_memory=False makes pandas read the file in one pass instead of
# chunk-by-chunk, which avoids mixed-dtype columns.
try:
    df = pd.read_csv(data_path, low_memory=False)
except FileNotFoundError:
    print(f"Error: {data_path} not found.")
    # exit() comes from the `site` module and is intended for interactive
    # use only; SystemExit terminates with the same status code.
    raise SystemExit(1)

# Quick look at what was loaded: column names, dimensions, and the distinct
# values of the two categorical columns printed below.
column_names = df.columns.tolist()
print("Dataset columns:", column_names)
print("Dataset shape:", df.shape)
for heading, column in (
    ("Unique TaggedHitType values:", 'TaggedHitType'),
    ("Unique PlayResult values:", 'PlayResult'),
):
    print(heading, df[column].unique())

# Columns fed to the scalers/models, in the order they are passed.
features = ['ExitSpeed', 'Angle', 'Direction', 'HitSpinRate', 'Distance', 'Bearing', 'HangTime']

# Check for missing features BEFORE touching the columns: indexing df[col]
# for an absent column raises KeyError, which would pre-empt this message.
missing_cols = [col for col in features if col not in df.columns]
if missing_cols:
    print(f"Error: Missing columns {missing_cols} in dataset.")
    # exit() is a `site`-module helper for interactive use; SystemExit
    # terminates with the same status code without depending on it.
    raise SystemExit(1)

# Ensure features are numeric; values that cannot be parsed become NaN.
for col in features:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Function to predict xBA for a row
def predict_xba(row, models, scalers, features):
    """Return the predicted xBA for a single row.

    Args:
        row: Mapping-like record (e.g. a pandas Series) holding
            'TaggedHitType' and every column named in ``features``.
        models: Dict mapping hit type to an object exposing ``predict_proba``.
        scalers: Dict mapping hit type to an object exposing ``transform``.
        features: List of column names used as model input, in order.

    Returns:
        Column 1 of the first row of ``predict_proba``'s output, or NaN when
        the row's hit type has no model or any feature value is missing.
    """
    tagged_type = row['TaggedHitType']

    # Guard: no model available for this hit type.
    if tagged_type not in models:
        return np.nan

    feature_values = row[features]

    # Guard: at least one feature is missing.
    if pd.isna(feature_values).any():
        return np.nan

    # Shape the features as a single-sample 2-D array before scaling.
    sample = feature_values.values.reshape(1, -1)
    scaled_sample = scalers[tagged_type].transform(sample)
    probabilities = models[tagged_type].predict_proba(scaled_sample)
    return probabilities[0, 1]

# Filter batted ball events
batted_balls = df['TaggedHitType'].isin(hit_types)

# Predict xBA for batted balls.
# Rows are scored in one batch per hit type instead of one scaler/model call
# per row: the values are the same, it is far faster on large files, and it
# also works when no row matches (row-wise apply on an empty frame returns a
# DataFrame rather than a Series, which breaks the column assignment).
print("Predicting xBA for batted balls...")
df['xBA'] = np.nan
# Only rows with every feature present are scored; all others stay NaN.
complete_features = df[features].notna().all(axis=1)
for hit_type in hit_types:
    scorable = batted_balls & complete_features & (df['TaggedHitType'] == hit_type)
    if not scorable.any():
        continue
    # Pass a plain float array, matching the array input used for single rows.
    batch = df.loc[scorable, features].to_numpy(dtype=float)
    batch_scaled = scalers[hit_type].transform(batch)
    # Column 1 of predict_proba is the value stored as xBA.
    df.loc[scorable, 'xBA'] = models[hit_type].predict_proba(batch_scaled)[:, 1]

# Reorder so that xBA is the last column; all other columns keep their order.
cols = df.columns[df.columns != 'xBA'].tolist() + ['xBA']
df = df[cols]

# Report summary statistics and how many rows have no xBA value.
xba_values = df['xBA']
print("xBA column added. Summary:")
print(xba_values.describe())
print("Missing xBA values:", xba_values.isna().sum())

# Write the full dataset, without the index, to output_path.
df.to_csv(output_path, index=False)
print(f"Updated dataset saved to {output_path}")

# Rows that received an xBA value; both the averages and the counts below
# are computed from this subset only.
scored = df[df['xBA'].notna()]

# Mean xBA per batter, highest first, with the value column renamed.
xba_summary = (
    scored.groupby('Batter')['xBA']
    .mean()
    .reset_index()
    .sort_values('xBA', ascending=False)
)
xba_summary.columns = ['Batter', 'Average_xBA']

# Keep only batters with at least `min_pa` rows that have an xBA value
# (the count is of scored rows per batter, not of plate appearances).
min_pa = 10
pa_counts = scored['Batter'].value_counts()
qualified_batters = pa_counts[pa_counts >= min_pa].index
xba_summary = xba_summary[xba_summary['Batter'].isin(qualified_batters)]

# Write the summary table, without the index, to summary_path.
xba_summary.to_csv(summary_path, index=False)
print(f"Batter xBA summary saved to {summary_path}")

# Print the complete filtered table.
print("\nAll Batters by Average xBA:")
print(xba_summary)

Dataset columns: ['PitchNo', 'Date', 'Time', 'PAofInning', 'PitchofPA', 'Pitcher', 'PitcherId', 'PitcherThrows', 'PitcherTeam', 'Batter', 'BatterId', 'BatterSide', 'BatterTeam', 'PitcherSet', 'Inning', 'Top/Bottom', 'Outs', 'Balls', 'Strikes', 'TaggedPitchType', 'AutoPitchType', 'PitchCall', 'KorBB', 'TaggedHitType', 'PlayResult', 'OutsOnPlay', 'RunsScored', 'Notes', 'RelSpeed', 'VertRelAngle', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 'Tilt', 'RelHeight', 'RelSide', 'Extension', 'VertBreak', 'InducedVertBreak', 'HorzBreak', 'PlateLocHeight', 'PlateLocSide', 'ZoneSpeed', 'VertApprAngle', 'HorzApprAngle', 'ZoneTime', 'ExitSpeed', 'Angle', 'Direction', 'HitSpinRate', 'PositionAt110X', 'PositionAt110Y', 'PositionAt110Z', 'Distance', 'LastTrackedDistance', 'Bearing', 'HangTime', 'pfxx', 'pfxz', 'x0', 'y0', 'z0', 'vx0', 'vy0', 'vz0', 'ax0', 'ay0', 'az0', 'HomeTeam', 'AwayTeam', 'Stadium', 'Level', 'League', 'GameID', 'PitchUUID', 'yt_RelSpeed', 'yt_RelHeight', 'yt_RelSide', 'yt_VertRelAngle',



In [11]:
# Read Data/BeesBelters5-28.csv into `df`, rebinding the name to this file's contents.
df = pd.read_csv('Data/BeesBelters5-28.csv')

0    0.011098
1         NaN
2         NaN
3         NaN
4         NaN
Name: xBA, dtype: float64