In [2]:
# ...existing code...
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.calibration import calibration_curve
import os
from xgboost import XGBRegressor  # Change to regressor



In [None]:


dtypes = {
    'PlayResult': 'object',
    'TaggedHitType': 'object',
    'ExitSpeed': 'float64',
    'Angle': 'float64',
    'Direction': 'float64',
    'HitSpinRate': 'float64',
    'Distance': 'float64',
    'Bearing': 'float64',
    'HangTime': 'float64'
}
# ...existing code...

data = pd.read_csv('../Data/2025.csv', dtype=dtypes, low_memory=False)
df = data[['PlayResult', 'TaggedHitType', 'ExitSpeed', 'Angle']]

plot_dir = 'xSLG'
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)
# Clean PlayResult: Map outcomes to slug values
slug_map = {'Out': 0, 'Single': 1, 'Double': 2, 'Triple': 3, 'HomeRun': 4}
df = df[df['PlayResult'].isin(slug_map.keys())]
df['PlayResult'] = df['PlayResult'].map(slug_map)

# ...existing code...
# Convert PlayResult to numeric to ensure no strings remain
df['PlayResult'] = pd.to_numeric(df['PlayResult'], errors='coerce')
print("Missing PlayResult after encoding:", df['PlayResult'].isna().sum())
df = df.dropna(subset=['PlayResult'])
df['PlayResult'] = df['PlayResult'].astype('int64')


dgb = df[df['TaggedHitType'] == 'GroundBall'].dropna().reset_index(drop=True)
dpu = df[df['TaggedHitType'] == 'Popup'].dropna().reset_index(drop=True)
dld = df[df['TaggedHitType'] == 'LineDrive'].dropna().reset_index(drop=True)
dfb = df[df['TaggedHitType'] == 'FlyBall'].dropna().reset_index(drop=True)

# Check shapes and PlayResult distribution
print("GroundBall shape:", dgb.shape, "\nPlayResult counts:\n", dgb['PlayResult'].value_counts())
print("Popup shape:", dpu.shape, "\nPlayResult counts:\n", dpu['PlayResult'].value_counts())
print("LineDrive shape:", dld.shape, "\nPlayResult counts:\n", dld['PlayResult'].value_counts())
print("FlyBall shape:", dfb.shape, "\nPlayResult counts:\n", dfb['PlayResult'].value_counts())
print("Columns:", df.columns.tolist())
print("Data types:\n", df.dtypes)

# Define features for modeling (excluding non-numerical TaggedHitType)
features = ['ExitSpeed', 'Angle']

# Initialize dictionaries to store models, scalers, and test results
models = {}
scalers = {}
test_results = {}


# Train and evaluate models for each hit type
for df_hit, hit_type in [(dgb, 'GroundBall'), (dpu, 'Popup'), (dld, 'LineDrive'), (dfb, 'FlyBall')]:
    print(f"\nTraining model for {hit_type}...")
    X = df_hit[features]
    y = df_hit['PlayResult']
    
    if len(df_hit) < 10:
        print(f"Warning: {hit_type} has too few samples: {len(df_hit)}")
        continue
    if len(y.unique()) < 2:
        print(f"Warning: {hit_type} has only one class: {y.unique()}")
        continue
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"{hit_type} test set size:", len(y_test))
    print(f"{hit_type} test set classes:", y_test.unique())
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train XGBoost Regressor
    xgb_model = XGBRegressor(random_state=42)
    xgb_model.fit(X_train_scaled, y_train)
    
    # Predict xSLG
    y_pred = xgb_model.predict(X_test_scaled)
    
    # Evaluate regression performance
    mse = np.mean((y_test - y_pred) ** 2)
    print(f"{hit_type} MSE:", mse)
    
    models[hit_type] = xgb_model
    scalers[hit_type] = scaler
    test_results[hit_type] = {'y_test': y_test, 'y_pred': y_pred}
    
    # Save feature importance plot
    plt.figure(figsize=(8, 6))
    pd.Series(xgb_model.feature_importances_, index=features).sort_values().plot(kind='barh')
    plt.title(f'Feature Importance for {hit_type}')
    plt.xlabel('Importance')
    plt.savefig(os.path.join(plot_dir, f'feature_importance_{hit_type}_xSLG.png'))
    plt.close()

# Function to predict xSLG for a single row
def predict_xslg(row, models, scalers):
    hit_type = row['TaggedHitType']
    if hit_type not in models:
        return np.nan
    features_values = row[features].values.reshape(1, -1)
    features_scaled = scalers[hit_type].transform(features_values)
    return models[hit_type].predict(features_scaled)[0]

# Apply predictions
df['xSLG'] = df.apply(lambda row: predict_xslg(row, models, scalers), axis=1)

# Verify xSLG
print("\nxSLG Summary:")
print(df['xSLG'].describe())

# Save DataFrame with xSLG
df.to_csv('xSLG/2025_with_xSLG.csv', index=False)
print("DataFrame with xSLG saved to 2025_with_xSLG.csv")
# ...existing code...

# Save models and scalers
for hit_type, model in models.items():
    with open(f'xSLG/xgb_model_{hit_type}.pkl', 'wb') as f:
        pickle.dump(model, f)
    with open(f'xSLG/scaler_{hit_type}.pkl', 'wb') as f:
        pickle.dump(scalers[hit_type], f)
print("Models and scalers saved successfully.")

Missing PlayResult after encoding: 0
GroundBall shape: (667, 4) 
PlayResult counts:
 PlayResult
0    441
1    207
2     17
3      2
Name: count, dtype: int64
Popup shape: (149, 4) 
PlayResult counts:
 PlayResult
0    133
1     12
2      4
Name: count, dtype: int64
LineDrive shape: (381, 4) 
PlayResult counts:
 PlayResult
1    172
0    114
2     80
4      9
3      6
Name: count, dtype: int64
FlyBall shape: (429, 4) 
PlayResult counts:
 PlayResult
0    312
4     64
2     32
1     14
3      7
Name: count, dtype: int64
Columns: ['PlayResult', 'TaggedHitType', 'ExitSpeed', 'Angle']
Data types:
 PlayResult         int64
TaggedHitType     object
ExitSpeed        float64
Angle            float64
dtype: object

Training model for GroundBall...
GroundBall test set size: 134
GroundBall test set classes: [0 2 1]
GroundBall MSE: 0.35201207915059296

Training model for Popup...
Popup test set size: 30
Popup test set classes: [0 2 1]
Popup MSE: 0.17780150952600499

Training model for LineDrive...
Lin




xSLG Summary:
count    1731.000000
mean        0.662878
std         0.979658
min        -0.777304
25%         0.002238
50%         0.069313
75%         0.999694
max         5.547213
Name: xSLG, dtype: float64


OSError: Cannot save file into a non-existent directory: 'xBA'