## Begin with loading in cleaned batting and pitching data for each team.
Mount drive and use the correct file path if it is stored else where. Otherwise, straight up uploading all the data in the data folder to colab content should work smoothly with this.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 1. Loading Data

In [18]:
import pandas as pd
import numpy as np

cubs_bat_df    = pd.read_csv('/content/cubs_standard_batting_clean.csv')
ws_bat_df      = pd.read_csv('/content/whitesox_standard_batting_clean.csv')
cubs_pitch_df  = pd.read_csv('/content/cubs_standard_pitching_clean.csv')
ws_pitch_df    = pd.read_csv('/content/whitesox_standard_pitching_clean.csv')



### 2. Calculate Probability

In [19]:
def bat_prob(df):
    df = df.copy()
    df['1B'] = df['H'] - df[['2B','3B','HR']].sum(axis=1)
    df['Out'] = df['PA'] - df[['HR','3B','2B','1B','BB','HBP','SO']].sum(axis=1)
    events = ['HR','3B','2B','1B','BB','HBP','SO','Out']
    for e in events:
        df[f'p_{e}'] = df[e] / df['PA']
    return df.set_index('Player')[[f'p_{e}' for e in events]]

def pitch_prob(df):
    df = df.copy()
    df['Out'] = df['BF'] - df[['HR','BB','HBP','SO']].sum(axis=1)
    events = ['HR','BB','HBP','SO','Out']
    for e in events:
        df[f'p_{e}'] = df[e] / df['BF']
    return df.set_index('Player')[[f'p_{e}' for e in events]]

cubs_bat_probs   = bat_prob(cubs_bat_df)
ws_bat_probs     = bat_prob(ws_bat_df)
cubs_pitch_probs = pitch_prob(cubs_pitch_df)
ws_pitch_probs   = pitch_prob(ws_pitch_df)

### 3. Lineup

The current setup does not consider player's position. It only pulls the top 9 players with the highest plate-appearance.

In [20]:
cubs_lineup = cubs_bat_df.sort_values('PA', ascending=False)['Player'].iloc[:9].tolist()
ws_lineup   = ws_bat_df.sort_values('PA', ascending=False)['Player'].iloc[:9].tolist()
ws_starter   = ws_pitch_df['Player'].iloc[0]
cubs_starter = cubs_pitch_df['Player'].iloc[0]

### 4. Functions

In [21]:
# One Plate Appearance
#
def simulate_pa(b_probs, p_probs=None, weight=0.5):
    events = [col[2:] for col in b_probs.index]
    b_vals = b_probs.values
    p_vals = np.array([p_probs.get(f'p_{e}', 0) if p_probs is not None else 0 for e in events])
    blended = (1 - weight) * b_vals + weight * p_vals
    blended /= blended.sum()
    return np.random.choice(events, p=blended)

# half-inning
# turning repeated plate appearances into runs, outs, and base-state progress for one side of an inning.
def simulate_half(lineup, pitcher_probs, batter_probs):
    outs, runs = 0, 0
    bases = [False, False, False]
    idx = 0
    n = len(lineup)
    while outs < 3:
        batter = lineup[idx]
        result = simulate_pa(batter_probs.loc[batter], pitcher_probs, weight=0.5)
        if result in ('Out', 'SO'):
            outs += 1
        elif result in ('BB', 'HBP'):
            if all(bases): runs += 1
            bases = [True] + bases[:2]
        elif result == 'HR':
            runs += sum(bases) + 1
            bases = [False, False, False]
        else:  # '1B','2B','3B'
            advance = {'1B':1,'2B':2,'3B':3}[result]
            for _ in range(advance):
                if bases[2]: runs += 1
                bases = [True] + bases[:2]
        idx = (idx + 1) % n
    return runs

# full game simulation
# top half with Cubs bat, bottom half with White Sox bat
def simulate_game(ws_lineup, cubs_lineup,
                  ws_pitcher, cubs_pitcher,
                  ws_pitch_probs, cubs_pitch_probs,
                  cubs_bat_probs, ws_bat_probs):
    ws_runs = cubs_runs = 0
    for _ in range(9):
        cubs_runs += simulate_half(cubs_lineup, ws_pitch_probs.loc[ws_pitcher], cubs_bat_probs)
        ws_runs   += simulate_half(ws_lineup, cubs_pitch_probs.loc[cubs_pitcher], ws_bat_probs)
    return ws_runs, cubs_runs

# Monte Carlo Loop
# calculates the probability of White Sox winning, estimates should converge to true probs by LLN.
def monte_carlo(n_sims, **kwargs):
    ws_wins = 0
    for _ in range(n_sims):
        ws_score, cubs_score = simulate_game(**kwargs)
        if ws_score > cubs_score:
            ws_wins += 1
    return ws_wins / n_sims

# repeat for 10000 times
win_prob = monte_carlo(
    n_sims=10000,
    ws_lineup=ws_lineup,
    cubs_lineup=cubs_lineup,
    ws_pitcher=ws_starter,
    cubs_pitcher=cubs_starter,
    ws_pitch_probs=ws_pitch_probs,
    cubs_pitch_probs=cubs_pitch_probs,
    cubs_bat_probs=cubs_bat_probs,
    ws_bat_probs=ws_bat_probs
)
print(f"White Sox win probability ~ {win_prob:.3f}")

White Sox win probability ~ 0.306
