In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')

In [29]:
# Load the pitch-level dataset.
df = pd.read_csv('final_baseball.csv')

# Lookup table from pitch_type code to its pitch_name label.
pitch_name_mapping = (
    df[['pitch_type', 'pitch_name']]
    .drop_duplicates()
    .set_index('pitch_type')['pitch_name']
    .to_dict()
)

# Dataset-wide baselines, computed up front so they can be used as fill values.
# "xba" here is the share of rows whose launch_speed_angle is > 0; rows with a
# missing launch_speed_angle compare False and still count in the denominator.
league_whiff = df['is_whiff'].mean()
league_xba = df['launch_speed_angle'].gt(0).mean()

# The same two rates, per (batter, pitch_type) pair.
# NOTE(review): these rates are grouped by pitch_type, which is also the label
# the classifier is trained to predict further down - confirm this is intended.
batter_stats = (
    df.groupby(['batter', 'pitch_type'])
    .agg(
        calculated_whiff=('is_whiff', 'mean'),
        calculated_xba=('launch_speed_angle', lambda x: (x > 0).mean()),
    )
    .reset_index()
)

# Attach the per-pair rates to every pitch (left join keeps all rows), fall
# back to the dataset-wide baseline where no rate is available, then drop the
# temporary columns.
df = (
    df.merge(
        batter_stats,
        how='left',
        on=['batter', 'pitch_type'],
        suffixes=('', '_calculated'),
    )
    .assign(
        batter_whiff_rate=lambda frame: frame['calculated_whiff'].fillna(league_whiff),
        batter_xba=lambda frame: frame['calculated_xba'].fillna(league_xba),
    )
    .drop(columns=['calculated_whiff', 'calculated_xba'])
)

# Training subset: only (pitcher, pitch_type) pairs with at least 100 rows.
# The inner merge also carries the pair's 'count' column into df_train.
pitch_counts = (
    df.groupby(['pitcher', 'pitch_type'])
    .size()
    .reset_index(name='count')
)
valid_pitchers = pitch_counts[pitch_counts['count'].ge(100)]
df_train = df.merge(valid_pitchers, on=['pitcher', 'pitch_type'])

In [30]:
# Model inputs, split by how they are preprocessed.
numerical_features = [
    'strikes',
    'balls',
    'release_speed',
    'pfx_x',
    'pfx_z',
    'VAA',
    'HAA',
    'batter_whiff_rate',
    'batter_xba',
]
categorical_features = [
    'stand',
    'p_throws',
    'zone_category',
]

# Standardize the numeric columns and one-hot encode the categorical ones;
# categories not seen during fit are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

In [31]:
# Guard: the engineered batter columns must be present before training.
assert 'batter_whiff_rate' in df, "Missing batter_whiff_rate column"
assert 'batter_xba' in df, "Missing batter_xba column"
print("Available columns:", df.columns.tolist())

# Encode the pitch_type label as consecutive integers for the classifier.
le = LabelEncoder()
df_train['pitch_type_encoded'] = le.fit_transform(df_train['pitch_type'])

# Preprocessing + multi-class XGBoost in a single pipeline, so the same
# transformations are applied at prediction time.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                num_class=len(le.classes_),
                objective='multi:softprob',
                eval_metric='mlogloss',
            ),
        ),
    ]
)

model.fit(
    df_train[[*numerical_features, *categorical_features]],
    df_train['pitch_type_encoded'],
)

# Persist the fitted pipeline and the label encoder for the recommender.
joblib.dump(model, 'pitch_recommender.pkl')
joblib.dump(le, 'label_encoder.pkl')

Available columns: ['game_date', 'game_type', 'pitcher', 'pitcher_name', 'batter', 'batter_name', 'pitch_type', 'pitch_name', 'events', 'description', 'stand', 'p_throws', 'release_speed', 'release_spin_rate', 'effective_speed', 'release_pos_x', 'release_pos_y', 'release_pos_z', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'zone', 'type', 'hit_location', 'bb_type', 'balls', 'strikes', 'hc_x', 'hc_y', 'launch_speed', 'launch_angle', 'launch_speed_angle', 'arm_angle', 'VAA', 'HAA', 'spin_axis', 'spin_efficiency', 'Vertical_Break', 'Horizontal_Break', 'is_whiff', 'is_strike', 'zone_category', 'pitcher_whiff_rate', 'batter_whiff_rate', 'league_whiff_rate', 'batter_xba']


['label_encoder.pkl']

In [32]:
# Helper functions
def get_pitcher_arsenal(pitcher_id, min_usage=0.05):
    """Return the pitch types a pitcher throws more often than ``min_usage``.

    Usage is each pitch type's share of the pitcher's rows in the module-level
    ``df`` (rows with a missing ``pitch_type`` are not counted).

    Args:
        pitcher_id: Value matched against ``df['pitcher']``.
        min_usage: Exclusive lower bound on a pitch type's usage share.
            Defaults to 0.05 (5%).

    Returns:
        list: Pitch type codes above the threshold. If none qualify (for
        example the pitcher has no rows in ``df``), every non-missing pitch
        type present in ``df`` is returned instead.
    """
    counts = df[df['pitcher'] == pitcher_id].groupby('pitch_type').size()
    usage = counts / counts.sum()
    valid_pitches = counts[usage > min_usage].index.tolist()
    if valid_pitches:
        return valid_pitches
    # Fallback to all pitches. Missing values are dropped so callers never
    # receive NaN/None as a pitch type.
    return df['pitch_type'].dropna().unique().tolist()

def get_batter_weaknesses(batter_id):
    """Summarize whiff rate and "xBA" per pitch type for one batter.

    Reads the module-level ``df``. When the batter has no rows there, the
    summary is computed over every row of ``df`` instead.

    Returns:
        DataFrame with columns ``pitch_type``, ``batter_whiff_rate`` (mean of
        ``is_whiff``) and ``batter_xba`` (share of rows whose
        ``launch_speed_angle`` is > 0).
    """
    rows_for_batter = df[df['batter'] == batter_id]
    # Unknown batter: use the whole dataset as the source.
    source = df if rows_for_batter.empty else rows_for_batter
    summary = source.groupby('pitch_type').agg(
        batter_whiff_rate=('is_whiff', 'mean'),
        batter_xba=('launch_speed_angle', lambda x: (x > 0).mean()),
    )
    return summary.reset_index()

In [33]:
# Recommendation engine
def get_pitch_recommendations(pitcher_id, batter_id, context, top_n=4):
    """Rank a pitcher's pitch types against a batter for a game context.

    For every pitch type returned by ``get_pitcher_arsenal`` a single feature
    row is built from the context, the median pitch characteristics, the most
    common ``zone_category`` and the batter's rates for that pitch type. The
    saved classifier's probability for that pitch type is blended with the
    batter's "xBA" into a score.

    Args:
        pitcher_id: Value matched against ``df['pitcher']``.
        batter_id: Value matched against ``df['batter']``.
        context: Mapping with the keys ``'strikes'``, ``'balls'``, ``'stand'``
            and ``'p_throws'``.
        top_n: Maximum number of recommendations to return (default 4).

    Returns:
        list[dict]: Up to ``top_n`` dicts sorted by descending ``'score'``,
        each with the keys ``'type'``, ``'name'``, ``'zone'``, ``'score'``,
        ``'confidence'`` and ``'rationale'``. Pitch types unknown to the label
        encoder, or with no usable rows in ``df``, are skipped.
    """
    # Artifacts written by the training cell. joblib files are pickles, so
    # only load files that come from a trusted source.
    model = joblib.load('pitch_recommender.pkl')
    le = joblib.load('label_encoder.pkl')
    known_pitch_types = set(le.classes_)

    feature_columns = numerical_features + categorical_features
    shape_columns = ['release_speed', 'pfx_x', 'pfx_z', 'VAA', 'HAA']

    # Hybrid score weights (60% model probability, 40% batter weakness).
    prob_weight, weakness_weight = 0.6, 0.4

    arsenal = get_pitcher_arsenal(pitcher_id)
    pitcher_data = df[df['pitcher'] == pitcher_id]
    batter_profile = get_batter_weaknesses(batter_id)

    # Dataset-wide medians, used when the batter has no row for a pitch type.
    fallback_whiff = df['batter_whiff_rate'].median()
    fallback_xba = df['batter_xba'].median()

    recommendations = []
    for pitch_type in arsenal:
        # The encoder raises on labels it was not fitted on, and the model has
        # no probability column for them, so such pitch types cannot be scored.
        if pitch_type not in known_pitch_types:
            continue

        # Typical characteristics of this pitch for this pitcher. When the
        # pitcher has no rows for it (e.g. the arsenal came from the
        # all-pitches fallback), use every row of that pitch type instead.
        pitch_rows = pitcher_data[pitcher_data['pitch_type'] == pitch_type]
        if pitch_rows.empty:
            pitch_rows = df[df['pitch_type'] == pitch_type]
        zone_modes = pitch_rows['zone_category'].mode()
        if zone_modes.empty:
            continue  # no zone information at all for this pitch type
        typical_zone = zone_modes.iloc[0]
        typical_shape = pitch_rows[shape_columns].median()

        # Batter's rates against this pitch type, with dataset-wide fallback.
        bw = batter_profile[batter_profile['pitch_type'] == pitch_type]
        if bw.empty:
            batter_whiff, batter_xba = fallback_whiff, fallback_xba
        else:
            batter_whiff = bw['batter_whiff_rate'].iloc[0]
            batter_xba = bw['batter_xba'].iloc[0]

        # Build the row by feature name, then order it like the training
        # frame, so a reordered feature list cannot silently mislabel values.
        input_row = {
            'strikes': context['strikes'],
            'balls': context['balls'],
            'release_speed': typical_shape['release_speed'],
            'pfx_x': typical_shape['pfx_x'],
            'pfx_z': typical_shape['pfx_z'],
            'VAA': typical_shape['VAA'],
            'HAA': typical_shape['HAA'],
            'batter_whiff_rate': batter_whiff,
            'batter_xba': batter_xba,
            'stand': context['stand'],
            'p_throws': context['p_throws'],
            'zone_category': typical_zone,
        }
        input_df = pd.DataFrame([input_row])[feature_columns]

        # Probability the model assigns to this pitch type for the row.
        class_index = le.transform([pitch_type])[0]
        prob = float(model.predict_proba(input_df)[0][class_index])

        final_score = (prob * prob_weight) + ((1 - batter_xba) * weakness_weight)

        recommendations.append({
            'type': pitch_type,
            'name': pitch_name_mapping.get(pitch_type, pitch_type),
            'zone': typical_zone,
            'score': final_score,
            # NOTE(review): only two tiers exist, so low scores are also
            # labelled 'Medium' - confirm that is intended.
            'confidence': 'High' if final_score > 0.7 else 'Medium',
            'rationale': (
                f"Pitcher: {prob:.1%} | "
                f"Batter: {batter_whiff:.1%} whiff/{batter_xba:.3f} xBA"
            )
        })

    # sorted() is stable, so ties keep arsenal order.
    ranked = sorted(recommendations, key=lambda rec: rec['score'], reverse=True)
    return ranked[:top_n]


In [34]:
# Format output
def format_recommendations(recommendations, top_n=4):
    """Print recommendations as an aligned, pipe-separated text table.

    Args:
        recommendations: Sequence of dicts with the keys ``'type'``,
            ``'name'``, ``'zone'``, ``'score'``, ``'confidence'`` and
            ``'rationale'``.
        top_n: Maximum number of rows to print (default 4).

    Returns:
        None. The table is written to standard output.
    """
    print("Top Recommendations:")
    print(f"{'Pitch':<20} | {'Zone':<12} | {'Score':<6} | {'Confidence':<10} | Rationale")
    print("-" * 90)

    for rec in recommendations[:top_n]:
        pitch_label = f"{rec['type']} ({rec['name']})"
        # Each cell uses the same width as its header so the separators line
        # up; the score is padded to 6 characters to match the 'Score' header.
        print(f"{pitch_label:<20} | "
              f"{rec['zone']:<12} | "
              f"{rec['score']:<6.2f} | "
              f"{rec['confidence']:<10} | "
              f"{rec['rationale']}")

In [37]:
# Example game situation: 1 strike, 3 balls, with the 'stand' and 'p_throws'
# values the model expects as categorical inputs.
game_context = dict(strikes=1, balls=3, stand='L', p_throws='R')

# Score this pitcher/batter matchup for the situation above.
recommendations = get_pitch_recommendations(669022, 663728, game_context)

# Show the ranked table.
format_recommendations(recommendations)

Top Recommendations:
Pitch                | Zone         | Score  | Confidence | Rationale
------------------------------------------------------------------------------------------
SL (Slider)          | low_away     | 0.84 | High       | Pitcher: 81.8% | Batter: 18.9% whiff/0.122 xBA
FF (4-Seam Fastball) | high_middle  | 0.84 | High       | Pitcher: 82.2% | Batter: 11.5% whiff/0.144 xBA
CU (Curveball)       | low_middle   | 0.67 | Medium     | Pitcher: 50.9% | Batter: 18.8% whiff/0.099 xBA
CH (Changeup)        | low_middle   | 0.33 | Medium     | Pitcher: 0.9% | Batter: 19.0% whiff/0.183 xBA
