In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import glob


In [5]:
# Collect the CSV exports from both data directories. The paths are sorted
# because glob returns them in arbitrary, OS-dependent order; sorting makes the
# row order of the combined frame reproducible between runs.
# NOTE(review): the variable names look swapped relative to the directories they
# glob (`cornbelters_files` reads ../KCLData, `other_files` reads
# ../CornBeltersData) -- confirm which naming is intended.
cornbelters_files = sorted(glob.glob("../KCLData/*.csv"))
other_files = sorted(glob.glob("../CornBeltersData/*.csv"))

# Only the ../KCLData files feed the rest of the notebook. To include the second
# directory as well, use `cornbelters_files + other_files` here instead.
all_files = cornbelters_files

# Read each CSV once and concatenate everything into one DataFrame.
stuff_plus = [pd.read_csv(f) for f in all_files]
df = pd.concat(stuff_plus, ignore_index=True)

In [6]:
# Show the column labels of the combined frame (rendered as this cell's output).
df.columns

Index(['PitchNo', 'Date', 'Time', 'PAofInning', 'PitchofPA', 'Pitcher',
       'PitcherId', 'PitcherThrows', 'PitcherTeam', 'Batter',
       ...
       'yt_ReleaseAccuracy', 'yt_ZoneAccuracy', 'yt_SeamLat', 'yt_SeamLong',
       'yt_ReleaseDistance', 'Catcher', 'CatcherId', 'CatcherTeam',
       'yt_AeroModel', 'AutoPitchType'],
      dtype='object', length=126)

In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# Step 1: Train the logistic regression model (provided code)
# The training set is synthetic: random game states with a simulated win label.
np.random.seed(42)
n_games = 10000

# The columns are drawn in this exact order; with a fixed seed, reordering the
# draws would change every generated value.
game_states = pd.DataFrame({
    'inning': np.random.randint(low=1, high=10, size=n_games),
    # 0 = Top (away bats), 1 = Bottom (home bats)
    'top_bottom': np.random.choice([0, 1], size=n_games),
    'outs': np.random.randint(low=0, high=3, size=n_games),
    # Batting team's score minus the opponent's
    'score_diff': np.random.randint(low=-10, high=10, size=n_games),
    'runners': np.random.randint(low=0, high=4, size=n_games),
})

# Simulate win outcomes: a noisy linear score, clipped to [0, 1] and then
# thresholded at 0.5 to obtain a binary label.
noise = np.random.normal(loc=0, scale=0.1, size=n_games)
win_score = (
    0.5
    + 0.1 * game_states['score_diff']
    + 0.05 * game_states['runners']
    - 0.03 * game_states['inning']
    - 0.02 * game_states['outs']
    + noise
)
game_states['win'] = (win_score.clip(0, 1) > 0.5).astype(int)

# Fit the logistic regression on the five game-state features.
feature_cols = ['inning', 'top_bottom', 'outs', 'score_diff', 'runners']
X = game_states[feature_cols]
y = game_states['win']
model = LogisticRegression(max_iter=10000)
model.fit(X, y)

In [None]:
import pandas as pd
import numpy as np

# Step 2: Define WP calculation function
def calculate_wp(model, inning, top_bottom, outs, score_diff, runners):
    """Return the model's win probability for a single game state.

    Parameters
    ----------
    model : object exposing ``predict_proba``
        Classifier queried with one row of five features, in the order
        ``inning, top_bottom, outs, score_diff, runners``.
    inning, top_bottom, outs, score_diff, runners : numeric
        Components of the game state.

    Returns
    -------
    The second column (index 1) of ``model.predict_proba`` for that row; for
    the model trained in this notebook that is the probability of the batting
    team winning.
    """
    state = np.array([[inning, top_bottom, outs, score_diff, runners]])

    # A model fitted on a DataFrame remembers its column names and warns on
    # every prediction made from a bare array. Label the row with the model's
    # own names (matched by position) so the call is warning-free; models
    # fitted without names still receive the plain array.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is not None and len(feature_names) == state.shape[1]:
        state = pd.DataFrame(state, columns=list(feature_names))

    return model.predict_proba(state)[0][1]  # Probability of batting team winning

# PlayResult values grouped by how they change the approximate runner count.
_BATTER_REACHES = ('Single', 'Walk', 'HitByPitch', 'Error')
_BATTER_OUT = ('Out', 'Strikeout', 'StrikeoutSwinging', 'StrikeoutLooking', 'FieldersChoice')
_SACRIFICES = ('Sacrifice', 'SacrificeFly', 'SacrificeBunt')
_RUNNER_ADVANCES = ('WildPitch', 'PassedBall', 'StolenBase')
# Results that end the game when the home team takes the lead in the 9th or later.
_WALK_OFF_RESULTS = ('HomeRun', 'Single', 'Double', 'Triple', 'Walk',
                     'HitByPitch', 'SacrificeFly', 'Sacrifice')


def _runners_after_play(runners, play_result, runs_scored, outs_on_play):
    """Approximate the number of baserunners (0-3) after a play.

    This is a count-only heuristic: it does not track which bases are occupied.
    Unrecognised ``play_result`` values leave the count unchanged.
    """
    if play_result in _BATTER_REACHES:
        runners += 1
    elif play_result == 'Double':
        # If anyone scored, assume one runner left the bases before the batter arrived.
        if runs_scored > 0:
            runners = max(0, runners - 1)
        runners += 1
    elif play_result == 'Triple':
        # Only the batter is assumed to remain on base.
        runners = 1
    elif play_result == 'HomeRun':
        runners = 0
    elif play_result in _BATTER_OUT:
        # Outs beyond the first are assumed to be runners erased on the bases.
        if outs_on_play > 1:
            runners -= (outs_on_play - 1)
    elif play_result in _SACRIFICES:
        runners -= 1
    elif play_result == 'CaughtStealing':
        runners -= 1
    elif play_result in _RUNNER_ADVANCES:
        runners += 1
    # There are only three bases, and the WP model is queried with this count.
    return max(0, min(3, runners))


def _plate_appearance_spans(plays):
    """Split a positionally indexed frame into inclusive (start, end) spans.

    A new plate appearance starts whenever the (Inning, Top/Bottom, PAofInning)
    triple differs from the previous row's.
    """
    keys = list(zip(plays['Inning'], plays['Top/Bottom'], plays['PAofInning']))
    spans = []
    start = 0
    for pos in range(1, len(keys)):
        if keys[pos] != keys[pos - 1]:
            spans.append((start, pos - 1))
            start = pos
    if keys:
        spans.append((start, len(keys) - 1))
    return spans


def compute_wpa(df, wp_model):
    """Compute win probability added (WPA) for each plate appearance of one game.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single game in chronological order. The columns read are
        PitchNo, Date, GameID, Batter, Pitcher, BatterTeam, PitcherTeam,
        Inning, Top/Bottom, PAofInning, Outs, PlayResult, RunsScored and
        OutsOnPlay. Any index is accepted; rows are addressed by position.
    wp_model : object
        Model forwarded to ``calculate_wp`` to score game states.

    Returns
    -------
    pandas.DataFrame
        One row per plate appearance with the columns PitchNo, Date, GameID,
        Batter, Pitcher, WPA, WP_before, WP_after, Inning, Outs, Runners,
        Score_Diff, PlayResult, Batting_Team, 'Current Team1' and
        'Current Team2'. An empty frame is returned for empty input.

    Notes
    -----
    * The state before a plate appearance is taken from its first row; the play
      outcome (PlayResult, RunsScored, OutsOnPlay) from its last row.
    * The running score only accumulates RunsScored from the last row of each
      plate appearance.
    * The runner count is an approximation (see ``_runners_after_play``); it is
      reset to zero whenever the half-inning changes and is kept within 0-3.
    * Every plate appearance, including the final one, goes through the same
      code path, so the runner rules are applied uniformly.
    * NOTE(review): the post-play state uses the last row's Outs value and does
      not add OutsOnPlay -- confirm whether that column already reflects the
      outs recorded on the play.
    """
    wpa_data = []
    # Work on a 0..n-1 index so that positions and labels cannot be confused.
    plays = df.reset_index(drop=True)
    if plays.empty:
        return pd.DataFrame(wpa_data)

    away_score = 0
    home_score = 0
    current_runners = 0
    current_half = None

    for start, end in _plate_appearance_spans(plays):
        first_row = plays.iloc[start]
        last_row = plays.iloc[end]

        inning = first_row['Inning']
        top_bottom = 0 if first_row['Top/Bottom'] == 'Top' else 1
        batting_team = 'away' if top_bottom == 0 else 'home'
        outs = first_row['Outs']

        # The bases are empty at the start of every half-inning.
        half = (inning, top_bottom)
        if half != current_half:
            current_runners = 0
            current_half = half

        # Score difference from the batting team's perspective.
        score_diff = (away_score - home_score) if batting_team == 'away' else (home_score - away_score)
        wp_before = calculate_wp(wp_model, inning, top_bottom, outs, score_diff, current_runners)

        current_team1 = first_row['BatterTeam']
        current_team2 = first_row['PitcherTeam']
        play_result = last_row['PlayResult']
        runs_scored = last_row['RunsScored']
        outs_on_play = last_row['OutsOnPlay']

        # Credit the runs to the batting side, then refresh the score difference.
        if batting_team == 'away':
            away_score += runs_scored
        else:
            home_score += runs_scored
        score_diff = (away_score - home_score) if batting_team == 'away' else (home_score - away_score)

        current_runners = _runners_after_play(current_runners, play_result, runs_scored, outs_on_play)

        # Walk-off: the home team takes the lead in the 9th inning or later.
        if (inning >= 9 and top_bottom == 1 and score_diff > 0
                and play_result in _WALK_OFF_RESULTS):
            wp_after = 1.0
        else:
            # Use the Outs value of the last row for the post-play estimate.
            outs = last_row['Outs']
            wp_after = calculate_wp(wp_model, inning, top_bottom, outs, score_diff, current_runners)

        wpa_data.append({
            'PitchNo': last_row['PitchNo'],
            'Date': last_row['Date'],
            'GameID': last_row['GameID'],
            'Batter': last_row['Batter'],
            'Pitcher': last_row['Pitcher'],
            'WPA': wp_after - wp_before,
            'WP_before': wp_before,
            'WP_after': wp_after,
            'Inning': inning,
            'Outs': outs,
            'Runners': current_runners,
            'Score_Diff': score_diff,
            'PlayResult': play_result,
            'Batting_Team': batting_team,
            'Current Team1': current_team1,
            'Current Team2': current_team2
        })

    return pd.DataFrame(wpa_data)

# Step 4: Clean the already-loaded pitch data (`df`) and compute WPA per game.
# Rows with a missing value in any of these columns are discarded.
required_columns = ['PitchNo', 'Date', 'GameID', 'PAofInning', 'Pitcher', 'Batter',
                    'Inning', 'Top/Bottom', 'Outs', 'PlayResult', 'OutsOnPlay', 'RunsScored']
# Columns shown in the preview printed at the end.
preview_columns = ['PitchNo', 'Date', 'GameID', 'Batter', 'Pitcher', 'WPA',
                   'WP_before', 'WP_after', 'PlayResult', 'Batting_Team']

# Drop incomplete rows, then order by PitchNo so each game's rows are in pitch order.
df = df.dropna(subset=required_columns).sort_values('PitchNo')

# Step 5: Process every game separately. A game that raises is reported and
# skipped so that the remaining games are still processed.
wpa_results = []
for game_id, game_df in df.groupby('GameID'):
    print(f"Processing game: {game_id}")
    game_df = game_df.reset_index(drop=True)  # positional index within the game
    try:
        wpa_df = compute_wpa(game_df, model)
        wpa_df['GameID'] = game_id
        wpa_results.append(wpa_df)
    except Exception as exc:
        print(f"Error processing game {game_id}: {exc}")

# Combine the per-game frames, write them to CSV and print a short preview.
if not wpa_results:
    print("No WPA results generated.")
else:
    final_wpa_df = pd.concat(wpa_results, ignore_index=True)
    final_wpa_df.to_csv('wpa_results.csv', index=False)
    print(final_wpa_df[preview_columns].head())

Processing game: 06c69396-5582-4f92-b0db-9567bae2f35c
Processing game: 0c6143d7-17cd-44cb-8742-2e9416f1cc29
Processing game: 12d685e0-ed41-4887-989e-bce93e07fe53




Processing game: 136e9a7b-d351-4a55-a13e-a25c1684ceb0
Processing game: 2228eb6c-c800-4ce9-8a84-561ca7c2cf2c
Processing game: 32d7c9ad-b180-4cdf-9dbb-d2b202911ebc
Processing game: 4a6ceeec-b7f7-469d-8e2a-bd0e4736f25b




Processing game: 5b65661a-d768-4cda-9c86-8fb477a9e97f
Processing game: 5e4402e6-e0d4-4f9e-8996-142e6a31fcba
Processing game: 6382e2ae-c12c-448b-afb8-e001b164724d
Processing game: 650e1a29-0b2c-4cd0-bb1b-126286579774
Processing game: 6b60b1f0-f900-4e83-a62b-51417f9156a4




Processing game: 6fb314cc-d72f-4379-9b76-3276a0161e62
Processing game: 7069efbf-ee7f-47d5-b28f-4b59e5ed92dc
Processing game: 83661acf-cb04-4f7a-913c-4c3f526cb8c1
Processing game: 8e6503c4-43dd-4ba3-9b22-05e3088d1e6f
Processing game: 94f39949-9b72-4aef-9be5-05d58a64e15b




Processing game: a7e2d5ec-5834-4283-8774-090d88d82cfa
Processing game: b6f56c4a-95bf-4870-8033-c04a1aa453a9
Processing game: bc919d6c-b91c-4e1d-9af2-89eed83c84af
Processing game: c65fe60e-26b1-4eae-86b5-dca04257ad83
Processing game: c91dcdf1-3333-4bb7-b651-94a6644e20ef




Processing game: d17febe0-201c-4034-8f55-ed2fb9efbf08
Processing game: d70f066d-1e55-4b57-ab7d-c9d77bf4f8d3
Processing game: dacdef0a-b909-42fc-a586-b4d4a7ac6822
Processing game: ef1cbb92-a212-4ecf-8c96-ccb1ba92a7a4
Processing game: effbe537-32d5-4c27-97c3-20c950b2c2cf




Processing game: f15ee0e1-420a-4465-8fae-aee41f015f95
Processing game: fc8fb489-dc74-45a7-bf0a-4bc453518bfd




Processing game: fcae5d30-7c3d-4ac4-b21d-0c15022aa8f7
Processing game: fd69b413-82e4-4a5a-beee-ce0c2aca9167
   PitchNo    Date                                GameID               Batter  \
0        1  6/8/25  06c69396-5582-4f92-b0db-9567bae2f35c  Sammy Descarpentrie   
1       13  6/8/25  06c69396-5582-4f92-b0db-9567bae2f35c   Cameron Steinbaugh   
2       18  6/8/25  06c69396-5582-4f92-b0db-9567bae2f35c           Will Vogel   
3       23  6/8/25  06c69396-5582-4f92-b0db-9567bae2f35c          Adan Nieves   
4       28  6/8/25  06c69396-5582-4f92-b0db-9567bae2f35c       Jackson Newton   

        Pitcher       WPA  WP_before  WP_after         PlayResult Batting_Team  
0   Logan Lynch  0.208989   0.301739  0.510728             Single         away  
1   Logan Lynch  0.419142   0.510728  0.929871               Walk         away  
2   Logan Lynch  0.064227   0.929871  0.994098               Walk         away  
3   Logan Lynch  0.000000   0.994098  0.994098  StrikeoutSwinging         away  


