# Game Preprocessing

This notebook prepares and cleans 2021 through 2025 MLB game data for modeling.


In [1]:
from functools import lru_cache
import pandas as pd
import numpy as np
from pathlib import Path 
from IPython.display import display, HTML
import sys
from pybaseball import playerid_reverse_lookup


In [2]:
# Make <repo_root>/src importable. The current working directory is taken to be
# the repository root, so the notebook must be launched from there.
repo_root = Path.cwd()
src_path = repo_root.joinpath("src")
if not any(entry == str(src_path) for entry in sys.path):
    # Prepend so the project's packages are found before other path entries.
    sys.path.insert(0, str(src_path))


# TODO: Organize and Comment the Imports

In [3]:
from preprocessing.preprocessing_common import (
    add_game_id,
    merge_game_number_and_pitcher,
    trim_game_id_inplace,
    append_game_number_to_game_id,
    PA_ENDING_EVENTS,
    filter_plate_appearances,
    combine_pitching_batting_deltas,
)

from preprocessing.pitching_preprocessing import (
    add_starter_indicator_pitchlevel,
    add_starter_full_game_indicator,
    add_pitching_indicators,
    split_starter_bullpen,
    aggregate_pitching_game_lines,
    add_rolling_pitching_counts,
    add_rate_metrics_from_rolled_counts,
    combine_game_level_pitching_rolling_rates,
    make_pitching_delta_df,
    
    #Move these to correct space
    
    summarize_pitching_rates,
    impute_pitching_roll_rates_from_prev_season,
    combine_game_level_pitching_rolling_rates,
    carry_forward_bullpen_rolls_on_full_games
)

from preprocessing.batting_preprocessing import (
    add_batting_indicators,
    aggregate_team_game_batting,
    add_time_rolling_batting_sums,
    add_rolling_obp_iso,
    add_rolling_obp_iso_batch,
    combine_home_away_batting_rolls,
    make_batting_delta_df,
    
    # Move to proper place
    calculate_mean_obp_iso,
    fill_missing_rolling_from_prior_year,
    split_home_away_team_game, 
    combine_home_away_by_game
)


from preprocessing.field_preprocessing import (
    add_fielding_indicators,
    make_game_fielding_bip_counts,
    add_rolling_bip_features,
    calculate_mean_bip_out_rate,
    fill_missing_rolling_bip_out_rate_from_prior_year,
    make_game_level_fielding_out_rate_wide,
    make_fielding_out_rate_deltas
)

In [4]:
from preprocessing.validation.game import qc_missing_halves
from preprocessing.outcomes import make_game_outcomes_from_statcast_maxscore
from preprocessing.validation.inspect import inspect_game_timeline
from preprocessing.validation.pitching import (starter_complete_game_flags, 
                                               summarize_complete_games,
                                               validate_starter_lines_by_year)

from preprocessing.schema import drop_rolled_component_cols
from preprocessing.validation.common import missing_summary, assert_no_missing


## Reading in Data

Below, I read in season-level data from **2021–2025** using a cached helper function to avoid repeated disk reads and improve performance.

In [5]:
# Location of the per-season raw CSV exports (relative to the working directory).
data_dir = Path("data") / "raw_season_data"

# Map each season year (2021-2025) to its CSV path.
season_files = {
    year: data_dir.joinpath(f"season_{year}.csv") for year in range(2021, 2026)
}

# Keyword arguments forwarded to pd.read_csv. Only the engine is set for now;
# the commented entries are optional tuning knobs kept for reference.
read_kwargs = dict(
    engine="pyarrow",                     # faster & lower memory if available
    # usecols=[...],                      # select only needed columns
    # parse_dates=[...],                  # e.g., ["game_date"]
    # dtype={"batter_id": "int32"},       # downcast numerics where safe
    # dtype_backend="pyarrow",            # pandas 2.1+: keeps Arrow dtypes
)

@lru_cache(maxsize=None)
def load_season(year: int) -> pd.DataFrame:
    """Read one season's CSV into a DataFrame, memoized per year.

    Parameters
    ----------
    year : int
        Season to load; must be a key of ``season_files``.

    Returns
    -------
    pd.DataFrame
        The frame parsed by ``pd.read_csv`` using ``read_kwargs``. Because the
        result is cached, repeated calls for the same year return the *same*
        object, so callers should not mutate it in place.

    Raises
    ------
    KeyError
        If ``year`` is not present in ``season_files``.
    """
    csv_path = season_files[year]
    return pd.read_csv(csv_path, **read_kwargs)

In [6]:
# Load all five seasons (cached by load_season) into their own variables.
season_2021, season_2022, season_2023, season_2024, season_2025 = (
    load_season(season_year) for season_year in range(2021, 2026)
)

# Preview the first five rows of each season under its own heading.
display(HTML("<h4>Season 2021</h4>"), season_2021.head(5))
display(HTML("<h4>Season 2022</h4>"), season_2022.head(5))
display(HTML("<h4>Season 2023</h4>"), season_2023.head(5))
display(HTML("<h4>Season 2024</h4>"), season_2024.head(5))
display(HTML("<h4>Season 2025</h4>"), season_2025.head(5))

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,FF,2021-10-03,92.3,1.4,6.8,"Smith, Will",596019,519293,field_out,hit_into_play,...,,1.28,0.69,-0.69,47.4,,,,,
1,SL,2021-10-03,80.6,1.6,6.64,"Smith, Will",596019,519293,,foul,...,,2.99,-0.77,0.77,44.3,,,,,
2,CU,2021-10-03,75.5,1.46,6.88,"Smith, Will",596019,519293,,foul,...,,4.52,-0.65,0.65,51.7,,,,,
3,CU,2021-10-03,75.0,1.53,6.83,"Smith, Will",596019,519293,,ball,...,,4.74,-0.69,0.69,49.5,,,,,
4,FF,2021-10-03,91.2,1.49,6.66,"Smith, Will",607043,519293,field_out,hit_into_play,...,,1.49,0.63,0.63,44.0,,,,,


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,CH,2022-10-05,80.8,-0.76,6.61,"Baker, Bryan",624415,641329,field_out,hit_into_play,...,0.0,2.68,1.34,-1.34,59.9,,,,,
1,FF,2022-10-05,97.7,-0.58,6.6,"Baker, Bryan",643376,641329,strikeout,swinging_strike,...,0.0,0.81,0.17,0.17,53.6,,,,,
2,CH,2022-10-05,84.9,-0.55,6.58,"Baker, Bryan",643376,641329,,ball,...,0.0,2.34,1.22,1.22,58.4,,,,,
3,FF,2022-10-05,97.2,-0.42,6.6,"Baker, Bryan",643376,641329,,swinging_strike,...,0.0,0.68,0.13,0.13,57.2,,,,,
4,SL,2022-10-05,86.2,-0.55,6.64,"Baker, Bryan",643376,641329,,called_strike,...,0.0,3.04,-0.63,-0.63,58.8,,,,,


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,CH,2023-10-01,89.0,-2.8,5.59,"Robertson, Nick",677008,687798,field_out,hit_into_play,...,,2.55,1.53,-1.53,31.7,1.676715,-1.896554,41.830979,30.714944,26.41202
1,FF,2023-10-01,96.9,-2.4,5.9,"Robertson, Nick",677008,687798,,foul,...,,1.09,0.76,-0.76,47.4,8.715532,3.692542,40.551342,33.656454,26.020583
2,CH,2023-10-01,90.0,-2.93,5.56,"Robertson, Nick",677008,687798,,ball,...,,2.47,1.65,-1.65,30.3,,,,,
3,ST,2023-10-01,82.2,-3.09,5.55,"Robertson, Nick",677008,687798,,ball,...,,3.14,-1.43,1.43,28.9,,,,,
4,CH,2023-10-01,89.2,-2.87,5.58,"Robertson, Nick",677008,687798,,swinging_strike,...,,2.57,1.49,-1.49,34.3,20.169759,-7.584644,37.675911,44.236969,36.187039


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,FF,2024-09-30,97.4,-2.1,4.88,"Díaz, Edwin",518595,621242,field_out,hit_into_play,...,1.0,1.4,0.96,0.96,17.6,6.149605,12.090516,22.1604,45.805662,22.048373
1,SL,2024-09-30,90.7,-2.14,5.06,"Díaz, Edwin",518595,621242,,ball,...,1.0,2.14,-0.2,-0.2,23.1,,,,,
2,SL,2024-09-30,91.1,-2.07,5.14,"Díaz, Edwin",518595,621242,,swinging_strike,...,1.0,2.37,-0.12,-0.12,22.4,23.541699,-27.093819,34.778701,45.227965,45.368412
3,SL,2024-09-30,91.3,-2.05,5.07,"Díaz, Edwin",518595,621242,,ball,...,1.0,2.09,-0.21,-0.21,22.4,,,,,
4,SL,2024-09-30,89.1,-2.13,5.15,"Díaz, Edwin",518595,621242,,swinging_strike,...,1.0,2.2,-0.17,-0.17,20.2,23.112048,-30.629825,33.038132,53.011806,51.686541


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,FF,2025-09-28,95.7,-2.15,5.21,"Weissert, Greg",678009,669711,field_out,hit_into_play,...,2.0,1.56,0.71,-0.71,20.9,5.991833,-1.319512,28.782516,41.559201,30.599805
1,FF,2025-09-28,95.1,-1.91,5.1,"Weissert, Greg",668670,669711,strikeout,called_strike,...,9.0,1.59,0.93,0.93,20.5,,,,,
2,FF,2025-09-28,95.4,-1.99,5.22,"Weissert, Greg",668670,669711,,foul,...,9.0,1.36,0.85,0.85,22.9,2.871131,31.805044,22.266527,37.478847,15.582717
3,SL,2025-09-28,84.8,-2.33,4.72,"Weissert, Greg",668670,669711,,swinging_strike,...,9.0,2.55,-0.32,-0.32,12.3,13.78541,4.08139,32.414181,38.011685,27.083341
4,SL,2025-09-28,85.3,-2.26,4.85,"Weissert, Greg",668670,669711,,called_strike,...,9.0,2.71,-0.52,-0.52,15.8,,,,,


## Examining Data

Below, I examine dataset dimensions, column consistency across seasons, and the columns selected from a representative dataframe.

### Dimensions

In [7]:
# Report the (rows, columns) shape of every season frame.
for y in range(2021, 2026):
    n_rows, n_cols = globals()[f"season_{y}"].shape
    print(f"season_{y}: {n_rows:,} rows × {n_cols} cols")

season_2021: 712,320 rows × 118 cols
season_2022: 710,210 rows × 118 cols
season_2023: 720,684 rows × 118 cols
season_2024: 732,481 rows × 118 cols
season_2025: 742,080 rows × 118 cols


### Column Consistency Check Across Seasons

- Defines the years to check (2021–2025).
- Uses `season_2025` as the reference column schema.
- Compares each `season_YYYY` dataset to the reference.
- Prints **OK** if column names *and order* match exactly, otherwise **DIFF**.
- Tracks whether all datasets match.
- Outputs a final `True/False` summary indicating full column consistency.

**Note:** This is a strict check — column order must also match.


In [8]:
# Strict schema check: every season's column Index must equal the 2025 one
# (same names, same order).
years = range(2021, 2026)
ref = season_2025.columns  # 2025 is the reference schema

print(f"[REFERENCE] season_2025 ({len(ref)} columns)\n")

all_match = True
for y in years:
    cols = globals()[f"season_{y}"].columns
    ok = cols.equals(ref)
    print(f"season_{y}: {'OK' if ok else 'DIFF'}")
    # Stays True only while every season seen so far matched the reference.
    all_match = all_match and ok

print("\nALL MATCH (names + order):", all_match)

[REFERENCE] season_2025 (118 columns)

season_2021: OK
season_2022: OK
season_2023: OK
season_2024: OK
season_2025: OK

ALL MATCH (names + order): True


### Printing Columns 

Since all columns match, I will print the `season_2025` columns.

In [9]:
# All seasons share one schema, so listing the 2025 columns covers every year.
list(season_2025.columns)

['pitch_type',
 'game_date',
 'release_speed',
 'release_pos_x',
 'release_pos_z',
 'player_name',
 'batter',
 'pitcher',
 'events',
 'description',
 'spin_dir',
 'spin_rate_deprecated',
 'break_angle_deprecated',
 'break_length_deprecated',
 'zone',
 'des',
 'game_type',
 'stand',
 'p_throws',
 'home_team',
 'away_team',
 'type',
 'hit_location',
 'bb_type',
 'balls',
 'strikes',
 'game_year',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'on_3b',
 'on_2b',
 'on_1b',
 'outs_when_up',
 'inning',
 'inning_topbot',
 'hc_x',
 'hc_y',
 'tfs_deprecated',
 'tfs_zulu_deprecated',
 'umpire',
 'sv_id',
 'vx0',
 'vy0',
 'vz0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'hit_distance_sc',
 'launch_speed',
 'launch_angle',
 'effective_speed',
 'release_spin_rate',
 'release_extension',
 'game_pk',
 'fielder_2',
 'fielder_3',
 'fielder_4',
 'fielder_5',
 'fielder_6',
 'fielder_7',
 'fielder_8',
 'fielder_9',
 'release_pos_y',
 'estimated_ba_using_speedangle',
 'estimated_woba_using_speedangle',
 'w

### Name Changes


In [10]:
# Statcast column name -> name used throughout this project.
rename_map = dict(
    game_pk="game_id",
    player_name="pitcher_name",
    batter="batter_id",
    pitcher="pitcher_id",
)

# Rebind each season variable to a renamed copy. rename() ignores keys that are
# absent, so re-running this cell is harmless.
for year in range(2021, 2026):
    df_name = f"season_{year}"
    renamed = globals()[df_name].rename(columns=rename_map)
    globals()[df_name] = renamed


## Regular Season Games

Below, I filter each season dataset (2021–2025) to retain **regular season** games only (`game_type == "R"`).

In [11]:
# Keep only regular-season pitches (game_type "R"); .copy() detaches the result
# from the unfiltered frame.
for y in range(2021, 2026):
    name = f"season_{y}"
    globals()[name] = (
        globals()[name]
        .loc[lambda frame: frame["game_type"].eq("R")]
        .copy()
    )

## Quality Check

Examine tied games, missing data, etc.

In [12]:
# Snapshot each season with all of its columns as `season_<year>_full`, so the
# complete frames remain available after the working frames are trimmed.
for y in range(2021, 2026):
    full_name = f"season_{y}_full"
    globals()[full_name] = globals()[f"season_{y}"].copy()

    print(f"Created season_{y}_full with {len(globals()[full_name]):,} rows")

Created season_2021_full with 712,320 rows
Created season_2022_full with 710,210 rows
Created season_2023_full with 720,684 rows
Created season_2024_full with 711,898 rows
Created season_2025_full with 712,528 rows


In [13]:
# Run the missing-halves QC on every full season frame and print its summary.
for y in range(2021, 2026):
    qc, summary = qc_missing_halves(globals()[f"season_{y}_full"])
    print(f"\n==== {y} ====")
    print(summary)

    # Show up to 20 offending games, if any.
    # NOTE(review): only rows flagged `all_top_or_all_bot` are printed here —
    # confirm that flag also covers the missing_top / missing_bot cases.
    bad = qc.loc[qc["all_top_or_all_bot"]]
    if len(bad) > 0:
        print(bad.head(20))


==== 2021 ====
{'total_games': 2429, 'missing_top': 0, 'missing_bot': 0, 'all_top_or_all_bot': 0}

==== 2022 ====
{'total_games': 2430, 'missing_top': 0, 'missing_bot': 0, 'all_top_or_all_bot': 0}

==== 2023 ====
{'total_games': 2430, 'missing_top': 0, 'missing_bot': 0, 'all_top_or_all_bot': 0}

==== 2024 ====
{'total_games': 2429, 'missing_top': 0, 'missing_bot': 0, 'all_top_or_all_bot': 0}

==== 2025 ====
{'total_games': 2430, 'missing_top': 0, 'missing_bot': 0, 'all_top_or_all_bot': 0}


### Number of Games

# TODO: Add Explanation

In [14]:
# Season year -> pitch-level frame for that season.
seasons = {
    season_year: globals()[f"season_{season_year}"]
    for season_year in range(2021, 2026)
}

# For each season build one (game_id, team, season) row per team appearance.
rows = []
for year, df in seasons.items():
    games = df[["game_id", "home_team", "away_team"]].drop_duplicates()

    home_side = games[["game_id", "home_team"]].rename(columns={"home_team": "team"})
    away_side = games[["game_id", "away_team"]].rename(columns={"away_team": "team"})

    long = pd.concat([home_side, away_side], ignore_index=True).drop_duplicates()
    long["season"] = year
    rows.append(long)

# Distinct games per (team, season), pivoted so each season is a column.
team_season_counts = (
    pd.concat(rows, ignore_index=True)
      .groupby(["team", "season"])["game_id"]
      .nunique()
)

games_per_team_all_years = (
    team_season_counts
      .unstack("season")            # one column per season
      .sort_index()                 # teams in alphabetical order
      .reset_index()                # team becomes a regular column
)

games_per_team_all_years

season,team,2021,2022,2023,2024,2025
0,ATH,162,162,162,162,162
1,ATL,161,162,162,162,162
2,AZ,162,162,162,162,162
3,BAL,162,162,162,162,162
4,BOS,162,162,162,162,162
5,CHC,162,162,162,162,162
6,CIN,162,162,162,162,162
7,CLE,162,162,162,161,162
8,COL,161,162,162,162,162
9,CWS,162,162,162,162,162


#### Game Number Analysis 

Most teams played 162 games. However, the Astros and Guardians played only 161 in 2024. This is not an error, as confirmed by [Astros 2024 Baseball Reference](https://www.baseball-reference.com/teams/HOU/2024.shtml) and [Guardians 2024](https://www.baseball-reference.com/teams/CLE/2024.shtml).

### Game Outcomes

Gets game outcomes and examines the rare case of ties. 

In [15]:
# Columns shown when auditing suspicious games; any that the outcomes frame
# does not contain are skipped.
TIE_AUDIT_COLS = [
    "game_id", "game_date", "home_team", "away_team",
    "final_home_score", "final_away_score", "home_win", "run_diff",
    "is_tie", "has_missing_final", "is_bad_game",
]

# NOTE(review): this loop starts at 2022, so no outcomes are built for 2021 —
# confirm that is intended.
for y in range(2022, 2026):
    season_df = globals()[f"season_{y}_full"]

    outcomes = make_game_outcomes_from_statcast_maxscore(
        season_df,
        game_id_col="game_id",
        home_score_col="post_home_score",
        away_score_col="post_away_score",
    )

    # Audit flags: a game is "bad" when its final score is tied or missing.
    outcomes["is_tie"] = outcomes["final_home_score"].eq(outcomes["final_away_score"])
    outcomes["has_missing_final"] = (
        outcomes["final_home_score"].isna() | outcomes["final_away_score"].isna()
    )
    outcomes["is_bad_game"] = outcomes["has_missing_final"] | outcomes["is_tie"]

    # Publish the per-season outcomes as `game_outcomes_<year>`.
    globals()[f"game_outcomes_{y}"] = outcomes

    # Audit views, ordered by date then home team, with a fresh 0..n-1 index.
    audit_cols = [col for col in TIE_AUDIT_COLS if col in outcomes.columns]
    sort_keys = ["game_date", "home_team"]

    bad_games = (
        outcomes.loc[outcomes["is_bad_game"], audit_cols]
        .sort_values(sort_keys, na_position="last")
        .reset_index(drop=True)
    )
    tied_games = (
        outcomes.loc[outcomes["is_tie"], audit_cols]
        .sort_values(sort_keys, na_position="last")
        .reset_index(drop=True)
    )

    globals()[f"bad_games_{y}"] = bad_games
    globals()[f"tied_games_{y}"] = tied_games

    print(
        f"{y}: {len(outcomes):,} games | "
        f"ties: {outcomes['is_tie'].sum()} | "
        f"missing finals: {outcomes['has_missing_final'].sum()} | "
        f"bad games total: {outcomes['is_bad_game'].sum()}"
    )


2022: 2,430 games | ties: 1 | missing finals: 0 | bad games total: 1
2023: 2,430 games | ties: 2 | missing finals: 0 | bad games total: 2
2024: 2,429 games | ties: 0 | missing finals: 0 | bad games total: 0
2025: 2,430 games | ties: 0 | missing finals: 0 | bad games total: 0


In [16]:
# Preview the first five outcome rows for each season that has outcomes.
display(HTML("<h4>Season 2022</h4>"), game_outcomes_2022.head(5))
display(HTML("<h4>Season 2023</h4>"), game_outcomes_2023.head(5))
display(HTML("<h4>Season 2024</h4>"), game_outcomes_2024.head(5))
display(HTML("<h4>Season 2025</h4>"), game_outcomes_2025.head(5))

Unnamed: 0,game_id,game_date,home_team,away_team,final_home_score,final_away_score,home_win,run_diff,is_tie,has_missing_final,is_bad_game
0,661032,2022-04-26,LAA,CLE,4,1,1,3,False,False,False
1,661033,2022-04-24,LAA,BAL,7,6,1,1,False,False,False
2,661034,2022-04-25,LAA,CLE,3,0,1,3,False,False,False
3,661035,2022-04-23,LAA,BAL,4,5,0,-1,False,False,False
4,661036,2022-04-11,LAA,MIA,6,2,1,4,False,False,False


Unnamed: 0,game_id,game_date,home_team,away_team,final_home_score,final_away_score,home_win,run_diff,is_tie,has_missing_final,is_bad_game
0,716352,2023-10-01,KC,NYY,5,2,1,3,False,False,False
1,716353,2023-10-01,STL,CIN,4,3,1,1,False,False,False
2,716354,2023-10-01,ATL,WSH,9,10,0,-1,False,False,False
3,716355,2023-10-01,NYM,PHI,1,9,0,-8,False,False,False
4,716356,2023-10-01,CWS,SD,1,2,0,-1,False,False,False


Unnamed: 0,game_id,game_date,home_team,away_team,final_home_score,final_away_score,home_win,run_diff,is_tie,has_missing_final,is_bad_game
0,744795,2024-09-25,WSH,KC,0,3,0,-3,False,False,False
1,744796,2024-09-26,WSH,KC,4,7,0,-3,False,False,False
2,744797,2024-09-27,WSH,PHI,9,1,1,8,False,False,False
3,744798,2024-09-29,WSH,PHI,3,6,0,-3,False,False,False
4,744799,2024-09-28,WSH,PHI,6,3,1,3,False,False,False


Unnamed: 0,game_id,game_date,home_team,away_team,final_home_score,final_away_score,home_win,run_diff,is_tie,has_missing_final,is_bad_game
0,776135,2025-09-28,LAA,HOU,2,6,0,-4,False,False,False
1,776136,2025-09-28,SD,AZ,12,4,1,8,False,False,False
2,776137,2025-09-28,SF,COL,4,0,1,4,False,False,False
3,776138,2025-09-28,ATH,KC,2,9,0,-7,False,False,False
4,776139,2025-09-28,SEA,LAD,1,6,0,-5,False,False,False


### Identifying Tied Games


In [17]:
# Stack the per-season tied-game tables. The dict keys become a `season_year`
# index level; the per-table row level is dropped before flattening.
all_tied_games = (
    pd.concat(
        {y: globals()[f"tied_games_{y}"] for y in range(2022, 2026)},
        names=["season_year", "row"],
    )
    .droplevel("row")
    .reset_index()
)

all_tied_games

Unnamed: 0,season_year,game_id,game_date,home_team,away_team,final_home_score,final_away_score,home_win,run_diff,is_tie,has_missing_final,is_bad_game
0,2022,663053,2022-07-08,CIN,TB,1,1,0,0,True,False,True
1,2023,717170,2023-08-01,KC,NYM,6,6,0,0,True,False,True
2,2023,716704,2023-09-05,KC,CWS,6,6,0,0,True,False,True


#### Note:

There are three games that are tied. Now, we will look into each individual game to figure out what is happening. 

### Individual Game Inspection

In [18]:
# Show the tail of the pitch timeline for each of the three tied games.
display(HTML("<h4>2022-07-08: </h4>"))
display(inspect_game_timeline(season_2022_full, 663053, tail_n=20))

display(HTML("<h4>2023-08-01: </h4>"))
display(inspect_game_timeline(season_2023_full, 717170, tail_n=20))

display(HTML("<h4>2023-09-05: </h4>"))
display(inspect_game_timeline(season_2023_full, 716704, tail_n=20))


Unnamed: 0,game_id,topbot_flag,game_date,inning,inning_topbot,at_bat_number,pitch_number,batter_id,events,description,post_home_score,post_away_score
0,663053,0,2022-07-08,10,Top,69,3,666139,,ball,1,1
1,663053,0,2022-07-08,10,Top,69,4,666139,,ball,1,1
2,663053,0,2022-07-08,10,Top,69,5,666139,,ball,1,1
3,663053,0,2022-07-08,10,Top,69,6,666139,single,hit_into_play,1,1
4,663053,0,2022-07-08,10,Top,70,1,650490,fielders_choice_out,hit_into_play,1,1
5,663053,0,2022-07-08,10,Top,71,1,677551,,ball,1,1
6,663053,0,2022-07-08,10,Top,71,2,677551,,called_strike,1,1
7,663053,0,2022-07-08,10,Top,71,3,677551,,ball,1,1
8,663053,0,2022-07-08,10,Top,71,4,677551,double_play,hit_into_play,1,1
9,663053,1,2022-07-08,10,Bot,72,1,669222,,ball,1,1


Unnamed: 0,game_id,topbot_flag,game_date,inning,inning_topbot,at_bat_number,pitch_number,batter_id,events,description,post_home_score,post_away_score
0,717170,1,2023-08-01,10,Bot,81,7,677951,double,hit_into_play,5,6
1,717170,1,2023-08-01,10,Bot,82,1,669004,single,hit_into_play,6,6
2,717170,1,2023-08-01,10,Bot,83,1,521692,,ball,6,6
3,717170,1,2023-08-01,10,Bot,83,2,521692,,foul,6,6
4,717170,1,2023-08-01,10,Bot,83,3,521692,,swinging_strike,6,6
5,717170,1,2023-08-01,10,Bot,83,4,521692,,foul,6,6
6,717170,1,2023-08-01,10,Bot,83,5,521692,,foul,6,6
7,717170,1,2023-08-01,10,Bot,83,6,521692,,foul,6,6
8,717170,1,2023-08-01,10,Bot,83,7,521692,,foul,6,6
9,717170,1,2023-08-01,10,Bot,83,8,521692,,ball,6,6


Unnamed: 0,game_id,topbot_flag,game_date,inning,inning_topbot,at_bat_number,pitch_number,batter_id,events,description,post_home_score,post_away_score
0,716704,0,2023-09-05,9,Top,74,5,683734,,foul,5,6
1,716704,0,2023-09-05,9,Top,74,6,683734,strikeout,called_strike,5,6
2,716704,1,2023-09-05,9,Bot,75,1,679845,field_error,hit_into_play,5,6
3,716704,1,2023-09-05,9,Bot,76,1,686681,,called_strike,5,6
4,716704,1,2023-09-05,9,Bot,76,2,686681,,ball,5,6
5,716704,1,2023-09-05,9,Bot,76,3,686681,single,hit_into_play,5,6
6,716704,1,2023-09-05,9,Bot,77,1,672580,,called_strike,5,6
7,716704,1,2023-09-05,9,Bot,77,2,672580,,blocked_ball,5,6
8,716704,1,2023-09-05,9,Bot,77,3,672580,,ball,5,6
9,716704,1,2023-09-05,9,Bot,77,4,672580,force_out,hit_into_play,5,6


### Batter and Game Analysis

In [19]:
# MLBAM ids of the batters referenced in the tied-game write-ups below.
batter_ids = [571980, 671221, 669004]

# Resolve the ids to player names and show only the id and name columns.
names = playerid_reverse_lookup(batter_ids, key_type="mlbam")
names.loc[:, ["key_mlbam", "name_first", "name_last"]]

Gathering player lookup table. This may take a moment.


Unnamed: 0,key_mlbam,name_first,name_last
0,669004,mj,melendez
1,571980,tyler,naquin
2,671221,drew,waters


**2022-07-08 (game_id: `663053`):** 

Official boxscore and play-by-play sources ([Baseball-Reference](https://www.baseball-reference.com/boxes/CIN/CIN202207080.shtml) and [Back to Baseball](https://backtobaseball.com/game/CIN202207080/cincinnati-reds/versus/tampa-bay-rays/2022/july/8/#text-play-section)) confirm that the game ended on a walk-off balk with Tyler Naquin (`571980`) at the plate. The balk occurred during his at-bat (count 2–1), but because no pitch was thrown at the moment of the balk, the event does not generate a pitch-level row in the Statcast feed. As a result, the `post_home_score` field does not reflect the game-winning run, and the Statcast-derived final score appears tied. To correct this discrepancy, we manually override the Cincinnati Reds’ final score to 2 to match the official result.

**2023-08-01 (game_id: `717170`):** 

Official boxscore and play-by-play sources ([Baseball-Reference](https://www.baseball-reference.com/boxes/KCA/KCA202308010.shtml) and [Back to Baseball](https://backtobaseball.com/game/KCA202308010/kansas-city-royals/versus/new-york-mets/2023/august/1/#text-play-section)) confirm that the game ended on a walk-off balk with Michael Massey at the plate. The balk occurred at a 0–0 count, meaning no pitch was thrown to Massey. Because Statcast pitch-level data only generates rows when a pitch occurs, no at-bat entry is recorded for Massey, and the final recorded event in the dataset is Drew Waters’ walk (`671221`). As a result, the `post_home_score` field does not capture the game-winning run from the balk. This explains why the pitch-level feed appears to end in a tie, and we manually override the final score to reflect the official result (Royals 7, Mets 6).

**2023-09-05 (game_id: `716704`):**

Official boxscore and play-by-play sources ([Baseball-Reference](https://www.baseball-reference.com/boxes/KCA/KCA202309050.shtml) and [Back to Baseball](https://backtobaseball.com/game/KCA202309050/kansas-city-royals/versus/chicago-white-sox/2023/september/5/#text-play-section)) confirm that the game ended on a walk-off balk in the bottom of the 9th. MJ Melendez (MLBAM 669004) had just been intentionally walked (at-bat 80), loading the bases, and the next batter (Olivares) came to the plate with a 0–0 count. Before a pitch was thrown, the umpire called a balk on Gregory Santos, allowing the winning run to score. Because no pitch occurred during that plate appearance, Statcast did not generate a pitch-level row capturing the balk, and the `post_home_score` field remains tied at 6–6 in the dataset. As a result, the pitch-level feed appears to end in a tie, and we manually override the final score to reflect the official result (Royals 7, White Sox 6).

### Updating Game Scores

In [20]:
# Manual final-score corrections, keyed by season and then game_id. Each entry
# is a game that ended on a walk-off balk, which the pitch-level feed does not
# record, so the derived final score appeared tied.
FINAL_SCORE_OVERRIDES = {
    2022: {
        663053: dict(final_home_score=2, final_away_score=1),  # 2022-07-08, CIN vs TB
    },
    2023: {
        717170: dict(final_home_score=7, final_away_score=6),  # 2023-08-01, KC vs NYM
        716704: dict(final_home_score=7, final_away_score=6),  # 2023-09-05, KC vs CWS
    },
}

In [21]:
# Rebuild each season's outcomes, apply the manual score corrections, then
# recompute every field derived from the final scores.
for y in range(2022, 2026):
    season_df = globals()[f"season_{y}_full"]

    outcomes = make_game_outcomes_from_statcast_maxscore(
        season_df,
        game_id_col="game_id",
        home_score_col="post_home_score",
        away_score_col="post_away_score",
    )

    # Seasons without overrides fall through with an empty mapping.
    for gid, scores in FINAL_SCORE_OVERRIDES.get(y, {}).items():
        mask = outcomes["game_id"] == gid
        if not mask.any():
            # An override for a game that is not in the data is treated as an error.
            raise ValueError(f"{gid} not found in {y} outcomes")

        for score_col in ("final_home_score", "final_away_score"):
            outcomes.loc[mask, score_col] = scores[score_col]

    # Derived fields must be computed after the overrides are in place.
    outcomes["home_win"] = outcomes["final_home_score"].gt(outcomes["final_away_score"]).astype(int)
    outcomes["run_diff"] = outcomes["final_home_score"].sub(outcomes["final_away_score"])

    outcomes["is_tie"] = outcomes["final_home_score"].eq(outcomes["final_away_score"])
    outcomes["has_missing_final"] = (
        outcomes["final_home_score"].isna() | outcomes["final_away_score"].isna()
    )
    outcomes["is_bad_game"] = outcomes["has_missing_final"] | outcomes["is_tie"]

    # Replace the previously stored `game_outcomes_<year>` with the corrected frame.
    globals()[f"game_outcomes_{y}"] = outcomes

    print(
        f"{y}: {len(outcomes):,} games | "
        f"ties: {outcomes['is_tie'].sum()} | "
        f"bad games: {outcomes['is_bad_game'].sum()}"
    )

2022: 2,430 games | ties: 0 | bad games: 0
2023: 2,430 games | ties: 0 | bad games: 0
2024: 2,429 games | ties: 0 | bad games: 0
2025: 2,430 games | ties: 0 | bad games: 0


#### Note:

Now we have no ties! We will eventually merge these into the batting and pitching dataframes.  

## Dropping Columns

Below, I retain only the columns needed for calculating our batting and pitching features.


In [22]:
# Columns retained for the batting and pitching feature calculations.
cols_to_keep = [
    # game identity
    "game_id", "game_date", "home_team", "away_team",
    # game state
    "inning", "inning_topbot", "pitch_number", "outs_when_up",
    "home_score", "away_score",
    # pitch result
    "events", "description",
    # participants
    "batter_id", "pitcher_id", "pitcher_name",
]

for y in range(2021, 2026):
    name = f"season_{y}"
    df = globals().get(name)

    if df is not None:
        # Requested columns that the frame lacks are skipped rather than raising.
        available = [col for col in cols_to_keep if col in df.columns]
        globals()[name] = df[available]

        print(f"{name}: kept {len(available)} columns")
    else:
        print(f"{name}: (not loaded)")

season_2021: kept 15 columns
season_2022: kept 15 columns
season_2023: kept 15 columns
season_2024: kept 15 columns
season_2025: kept 15 columns


In [23]:
# Preview the trimmed frames: first five rows per season.
display(HTML("<h4>Season 2021</h4>"), season_2021.head(5))
display(HTML("<h4>Season 2022</h4>"), season_2022.head(5))
display(HTML("<h4>Season 2023</h4>"), season_2023.head(5))
display(HTML("<h4>Season 2024</h4>"), season_2024.head(5))
display(HTML("<h4>Season 2025</h4>"), season_2025.head(5))

Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name
0,632254,2021-10-03,ATL,NYM,9,Top,4,2,5,0,field_out,hit_into_play,596019,519293,"Smith, Will"
1,632254,2021-10-03,ATL,NYM,9,Top,3,2,5,0,,foul,596019,519293,"Smith, Will"
2,632254,2021-10-03,ATL,NYM,9,Top,2,2,5,0,,foul,596019,519293,"Smith, Will"
3,632254,2021-10-03,ATL,NYM,9,Top,1,2,5,0,,ball,596019,519293,"Smith, Will"
4,632254,2021-10-03,ATL,NYM,9,Top,2,1,5,0,field_out,hit_into_play,607043,519293,"Smith, Will"


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name
0,663451,2022-10-05,BAL,TOR,9,Top,1,2,5,4,field_out,hit_into_play,624415,641329,"Baker, Bryan"
1,663451,2022-10-05,BAL,TOR,9,Top,5,1,5,4,strikeout,swinging_strike,643376,641329,"Baker, Bryan"
2,663451,2022-10-05,BAL,TOR,9,Top,4,1,5,4,,ball,643376,641329,"Baker, Bryan"
3,663451,2022-10-05,BAL,TOR,9,Top,3,1,5,4,,swinging_strike,643376,641329,"Baker, Bryan"
4,663451,2022-10-05,BAL,TOR,9,Top,2,1,5,4,,called_strike,643376,641329,"Baker, Bryan"


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name
0,716367,2023-10-01,BAL,BOS,9,Bot,6,2,1,6,field_out,hit_into_play,677008,687798,"Robertson, Nick"
1,716367,2023-10-01,BAL,BOS,9,Bot,5,2,1,6,,foul,677008,687798,"Robertson, Nick"
2,716367,2023-10-01,BAL,BOS,9,Bot,4,2,1,6,,ball,677008,687798,"Robertson, Nick"
3,716367,2023-10-01,BAL,BOS,9,Bot,3,2,1,6,,ball,677008,687798,"Robertson, Nick"
4,716367,2023-10-01,BAL,BOS,9,Bot,2,2,1,6,,swinging_strike,677008,687798,"Robertson, Nick"


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name
0,747139,2024-09-30,ATL,NYM,9,Bot,5,2,7,8,field_out,hit_into_play,518595,621242,"Díaz, Edwin"
1,747139,2024-09-30,ATL,NYM,9,Bot,4,2,7,8,,ball,518595,621242,"Díaz, Edwin"
2,747139,2024-09-30,ATL,NYM,9,Bot,3,2,7,8,,swinging_strike,518595,621242,"Díaz, Edwin"
3,747139,2024-09-30,ATL,NYM,9,Bot,2,2,7,8,,ball,518595,621242,"Díaz, Edwin"
4,747139,2024-09-30,ATL,NYM,9,Bot,1,2,7,8,,swinging_strike,518595,621242,"Díaz, Edwin"


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name
0,776151,2025-09-28,BOS,DET,9,Top,1,2,4,3,field_out,hit_into_play,678009,669711,"Weissert, Greg"
1,776151,2025-09-28,BOS,DET,9,Top,4,1,4,3,strikeout,called_strike,668670,669711,"Weissert, Greg"
2,776151,2025-09-28,BOS,DET,9,Top,3,1,4,3,,foul,668670,669711,"Weissert, Greg"
3,776151,2025-09-28,BOS,DET,9,Top,2,1,4,3,,swinging_strike,668670,669711,"Weissert, Greg"
4,776151,2025-09-28,BOS,DET,9,Top,1,1,4,3,,called_strike,668670,669711,"Weissert, Greg"


### Starting Pitcher Indicator

First, we create an indicator to identify the starting pitcher for each game in every season. This allows us to separate starter performance from bullpen performance when constructing pitching features. 

Furthermore, we add an indicator for whether the starting pitcher pitched the entire game (i.e., the team used exactly one pitcher). This will be useful for diagnosing missing games later.


In [24]:
# Add the starter flags to every season's pitch-level frame and write the
# result back under the same `season_<year>` global.
for y in range(2021, 2026):
    name = f"season_{y}"
    df = globals().get(name)

    if df is not None:
        # Inner call: enforce starter identity on the pitch-level data.
        # Outer call: add the "starter pitched full game" indicator
        # (team used exactly one pitcher).
        df = add_starter_full_game_indicator(add_starter_indicator_pitchlevel(df))
        globals()[name] = df
        print(f"{name}: starter enforced + starter_full_game added")
    else:
        print(f"{name}: (not found)")

season_2021: starter enforced + starter_full_game added
season_2022: starter enforced + starter_full_game added
season_2023: starter enforced + starter_full_game added
season_2024: starter enforced + starter_full_game added
season_2025: starter enforced + starter_full_game added


In [25]:
# Preview the first five rows of each season's pitch-level frame.
# A season whose frame is missing is reported and skipped, matching how the
# processing loop for `season_<year>` treats frames that were not found.
for y in range(2021, 2026):
    if globals().get(f"season_{y}") is None:
        print(f"season_{y}: (not found)")
        continue
    display(HTML(f"<h4>Season {y}</h4>"))
    display(globals()[f"season_{y}"].head(5))

Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
0,632169,2021-04-10,SF,COL,1,Bot,1,0,0,0,,blocked_ball,543105,592346,"González, Chi Chi",COL,1,0
1,632169,2021-04-10,SF,COL,1,Bot,1,0,0,0,,ball,573262,592346,"González, Chi Chi",COL,1,0
2,632169,2021-04-10,SF,COL,1,Bot,1,0,0,0,,called_strike,600303,592346,"González, Chi Chi",COL,1,0
3,632169,2021-04-10,SF,COL,1,Bot,2,0,0,0,field_out,hit_into_play,543105,592346,"González, Chi Chi",COL,1,0
4,632169,2021-04-10,SF,COL,1,Bot,2,0,0,0,,ball,573262,592346,"González, Chi Chi",COL,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
0,661032,2022-04-26,LAA,CLE,1,Bot,1,0,0,0,,called_strike,621493,663474,"McKenzie, Triston",CLE,1,0
1,661032,2022-04-26,LAA,CLE,1,Bot,2,0,0,0,,called_strike,621493,663474,"McKenzie, Triston",CLE,1,0
2,661032,2022-04-26,LAA,CLE,1,Bot,3,0,0,0,,foul,621493,663474,"McKenzie, Triston",CLE,1,0
3,661032,2022-04-26,LAA,CLE,1,Bot,4,0,0,0,,foul,621493,663474,"McKenzie, Triston",CLE,1,0
4,661032,2022-04-26,LAA,CLE,1,Bot,5,0,0,0,field_out,hit_into_play,621493,663474,"McKenzie, Triston",CLE,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
0,716352,2023-10-01,KC,NYY,1,Top,1,0,0,0,grounded_into_double_play,hit_into_play,543309,425844,"Greinke, Zack",KC,1,0
1,716352,2023-10-01,KC,NYY,1,Top,1,0,0,0,,foul,669224,425844,"Greinke, Zack",KC,1,0
2,716352,2023-10-01,KC,NYY,1,Top,1,0,0,0,,foul,683011,425844,"Greinke, Zack",KC,1,0
3,716352,2023-10-01,KC,NYY,1,Top,1,0,0,0,,called_strike,518934,425844,"Greinke, Zack",KC,1,0
4,716352,2023-10-01,KC,NYY,1,Top,2,0,0,0,single,hit_into_play,669224,425844,"Greinke, Zack",KC,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
0,744795,2024-09-25,WSH,KC,1,Bot,1,0,0,0,,ball,686611,547179,"Lorenzen, Michael",KC,1,0
1,744795,2024-09-25,WSH,KC,1,Bot,2,0,0,0,,ball,686611,547179,"Lorenzen, Michael",KC,1,0
2,744795,2024-09-25,WSH,KC,1,Bot,3,0,0,0,,foul,686611,547179,"Lorenzen, Michael",KC,1,0
3,744795,2024-09-25,WSH,KC,1,Bot,4,0,0,0,field_out,hit_into_play,686611,547179,"Lorenzen, Michael",KC,1,0
4,744795,2024-09-25,WSH,KC,1,Bot,1,1,0,0,,ball,695578,547179,"Lorenzen, Michael",KC,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
0,776135,2025-09-28,LAA,HOU,1,Bot,1,0,0,1,,swinging_strike,650859,621121,"McCullers Jr., Lance",HOU,1,0
1,776135,2025-09-28,LAA,HOU,1,Bot,2,0,0,1,,called_strike,650859,621121,"McCullers Jr., Lance",HOU,1,0
2,776135,2025-09-28,LAA,HOU,1,Bot,3,0,0,1,,foul,650859,621121,"McCullers Jr., Lance",HOU,1,0
3,776135,2025-09-28,LAA,HOU,1,Bot,4,0,0,1,,ball,650859,621121,"McCullers Jr., Lance",HOU,1,0
4,776135,2025-09-28,LAA,HOU,1,Bot,5,0,0,1,field_out,hit_into_play,650859,621121,"McCullers Jr., Lance",HOU,1,0


### Unique Events and Descriptions

**TODO**: Add explanation

In [26]:
# Print the distinct non-null `events` and `description` labels per season.
for y in range(2021, 2026):
    df = globals().get(f"season_{y}")

    if df is not None:
        # Missing labels are dropped; the rest are cast to str and sorted.
        events_unique = sorted(df["events"].dropna().astype(str).unique())
        desc_unique = sorted(df["description"].dropna().astype(str).unique())

        # One print call; sep="\n" yields the same lines as separate prints.
        print(
            f"\n=== season_{y} ===",
            f"events unique ({len(events_unique)}):",
            events_unique,
            f"\ndescription unique ({len(desc_unique)}):",
            desc_unique,
            sep="\n",
        )
    else:
        print(f"season_{y}: (not loaded)")


=== season_2021 ===
events unique (23):
['catcher_interf', 'double', 'double_play', 'field_error', 'field_out', 'fielders_choice', 'fielders_choice_out', 'force_out', 'grounded_into_double_play', 'hit_by_pitch', 'home_run', 'intent_walk', 'sac_bunt', 'sac_bunt_double_play', 'sac_fly', 'sac_fly_double_play', 'single', 'strikeout', 'strikeout_double_play', 'triple', 'triple_play', 'truncated_pa', 'walk']

description unique (15):
['automatic_ball', 'ball', 'blocked_ball', 'bunt_foul_tip', 'called_strike', 'foul', 'foul_bunt', 'foul_pitchout', 'foul_tip', 'hit_by_pitch', 'hit_into_play', 'missed_bunt', 'pitchout', 'swinging_strike', 'swinging_strike_blocked']

=== season_2022 ===
events unique (22):
['catcher_interf', 'double', 'double_play', 'field_error', 'field_out', 'fielders_choice', 'fielders_choice_out', 'force_out', 'grounded_into_double_play', 'hit_by_pitch', 'home_run', 'intent_walk', 'sac_bunt', 'sac_fly', 'sac_fly_double_play', 'single', 'strikeout', 'strikeout_double_play', 

## Pitcher Metrics

Now, we can begin creating the features needed for modeling. We start by constructing pitching features based on the **starting pitcher**, including:

- **FIP**
- **WHIP**
- **K9**
- **HR9**

For each metric, we compute rolling **3-day** and **7-day** values and then calculate the difference between the home and away teams. In addition, we compute rolling **FIP** for the remaining bullpen.

Formal definitions of each pitching metric and implementation details are provided in a later section.


### Plate Appearances

Below, I filter the Statcast pitch-level data to retain only **plate appearance–ending** events (e.g., hits, walks, strikeouts, outs, and sacrifices). This ensures each plate appearance is counted once and excludes incomplete plate appearances labeled as `truncated_pa`. These filtered plate appearances are then used to calculate features for both pitchers and batters later in the notebook.


In [27]:
# Build `pa_<year>` from each `season_<year>` frame with
# filter_plate_appearances and report how many rows were kept.
for y in range(2021, 2026):
    season_name, pa_name = f"season_{y}", f"pa_{y}"
    df = globals().get(season_name)

    if df is not None:
        globals()[pa_name] = filter_plate_appearances(df)
        print(f"{pa_name}: {len(globals()[pa_name]):,} rows kept")
    else:
        print(f"{season_name}: (not loaded)")


pa_2021: 181,816 rows kept
pa_2022: 182,044 rows kept
pa_2023: 184,104 rows kept
pa_2024: 182,440 rows kept
pa_2025: 182,926 rows kept


In [28]:
# Preview the first five rows of each season's plate-appearance frame.
# A season whose `pa_<year>` frame is missing is reported and skipped instead
# of aborting the whole cell with a NameError.
for y in range(2021, 2026):
    if globals().get(f"pa_{y}") is None:
        print(f"pa_{y}: (not found)")
        continue
    display(HTML(f"<h4>Season {y}</h4>"))
    display(globals()[f"pa_{y}"].head(5))

Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
3,632169,2021-04-10,SF,COL,1,Bot,2,0,0,0,field_out,hit_into_play,543105,592346,"González, Chi Chi",COL,1,0
14,632169,2021-04-10,SF,COL,1,Bot,7,0,0,0,walk,ball,573262,592346,"González, Chi Chi",COL,1,0
15,632169,2021-04-10,SF,COL,1,Bot,7,0,0,0,single,hit_into_play,600303,592346,"González, Chi Chi",COL,1,0
19,632169,2021-04-10,SF,COL,1,Bot,4,1,0,0,field_out,hit_into_play,474832,592346,"González, Chi Chi",COL,1,0
27,632169,2021-04-10,SF,COL,1,Bot,8,2,0,0,field_out,hit_into_play,446334,592346,"González, Chi Chi",COL,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
0,661032,2022-04-26,LAA,CLE,1,Bot,5,0,0,0,field_out,hit_into_play,621493,663474,"McKenzie, Triston",CLE,1,0
1,661032,2022-04-26,LAA,CLE,1,Bot,1,1,0,0,field_out,hit_into_play,660271,663474,"McKenzie, Triston",CLE,1,0
2,661032,2022-04-26,LAA,CLE,1,Bot,5,2,0,0,strikeout,called_strike,545361,663474,"McKenzie, Triston",CLE,1,0
3,661032,2022-04-26,LAA,CLE,2,Bot,2,0,0,0,field_out,hit_into_play,665120,663474,"McKenzie, Triston",CLE,1,0
4,661032,2022-04-26,LAA,CLE,2,Bot,1,1,0,0,double,hit_into_play,543685,663474,"McKenzie, Triston",CLE,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
0,716352,2023-10-01,KC,NYY,1,Top,1,0,0,0,grounded_into_double_play,hit_into_play,543309,425844,"Greinke, Zack",KC,1,0
4,716352,2023-10-01,KC,NYY,1,Top,2,0,0,0,single,hit_into_play,669224,425844,"Greinke, Zack",KC,1,0
7,716352,2023-10-01,KC,NYY,1,Top,3,0,0,0,single,hit_into_play,683011,425844,"Greinke, Zack",KC,1,0
15,716352,2023-10-01,KC,NYY,1,Top,10,0,0,0,single,hit_into_play,518934,425844,"Greinke, Zack",KC,1,0
18,716352,2023-10-01,KC,NYY,1,Top,3,2,0,0,field_out,hit_into_play,643396,425844,"Greinke, Zack",KC,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
0,744795,2024-09-25,WSH,KC,1,Bot,4,0,0,0,field_out,hit_into_play,686611,547179,"Lorenzen, Michael",KC,1,0
1,744795,2024-09-25,WSH,KC,1,Bot,6,1,0,0,strikeout,swinging_strike,695578,547179,"Lorenzen, Michael",KC,1,0
2,744795,2024-09-25,WSH,KC,1,Bot,3,2,0,0,field_out,hit_into_play,671277,547179,"Lorenzen, Michael",KC,1,0
3,744795,2024-09-25,WSH,KC,2,Bot,1,0,0,0,field_out,hit_into_play,608336,547179,"Lorenzen, Michael",KC,1,0
4,744795,2024-09-25,WSH,KC,2,Bot,4,0,0,0,walk,ball,677588,547179,"Lorenzen, Michael",KC,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_id,pitcher_name,pitching_team,is_starter,starter_full_game
4,776135,2025-09-28,LAA,HOU,1,Bot,5,0,0,1,field_out,hit_into_play,650859,621121,"McCullers Jr., Lance",HOU,1,0
11,776135,2025-09-28,LAA,HOU,1,Bot,4,1,1,1,strikeout,called_strike,666176,621121,"McCullers Jr., Lance",HOU,1,0
14,776135,2025-09-28,LAA,HOU,1,Bot,6,1,0,1,home_run,hit_into_play,545361,621121,"McCullers Jr., Lance",HOU,1,0
18,776135,2025-09-28,LAA,HOU,1,Bot,4,2,1,1,field_out,hit_into_play,621035,621121,"McCullers Jr., Lance",HOU,1,0
22,776135,2025-09-28,LAA,HOU,2,Bot,4,0,1,1,field_out,hit_into_play,695681,621121,"McCullers Jr., Lance",HOU,1,0


### Pitching Indicators

Next, we create indicator variables needed to compute rolling **FIP**, **WHIP**, **K9**, and **HR9**. These pitching metrics are defined as follows:

$$\text{WHIP} = \frac{H + BB + HBP}{IP}$$

$$\text{K/9} = \frac{9 \times K}{IP}$$

$$\text{HR/9} = \frac{9 \times HR}{IP}$$

$$\text{FIP} = \frac{13 \times HR + 3 \times (BB + HBP) - 2 \times K}{IP}$$


where **IP** denotes innings pitched, computed as total outs divided by three. These formulas are applied to rolling aggregates of the underlying event indicators to construct the final pitching features.

We also create the `did_not_end_pa` flag to preserve complete starter and bullpen coverage at the game level. Although these rows are excluded from rate-stat aggregation via `is_pa_countable`, they are intentionally retained to prevent missing pitcher records during game-level aggregation.


**NOTE TO SELF**: Explain what FIP, WHIP, HR9, and K9 are earlier in the notebook, preferably at the beginning of the pitching section.

In [29]:
# Derive `pa_pitcher_<year>` from `pa_<year>` by adding the pitching
# indicator columns.
for y in range(2021, 2026):
    src_name, dst_name = f"pa_{y}", f"pa_pitcher_{y}"
    df = globals().get(src_name)

    if df is not None:
        # The helper receives a copy, so `pa_<year>` itself is not the object
        # handed to add_pitching_indicators.
        globals()[dst_name] = add_pitching_indicators(df.copy())
        print(f"{dst_name}: indicators added (from {src_name})")
    else:
        print(f"{src_name}: (not found)")


pa_pitcher_2021: indicators added (from pa_2021)
pa_pitcher_2022: indicators added (from pa_2022)
pa_pitcher_2023: indicators added (from pa_2023)
pa_pitcher_2024: indicators added (from pa_2024)
pa_pitcher_2025: indicators added (from pa_2025)


In [30]:
# Preview the first five rows of each season's indicator-augmented frame.
# A season whose `pa_pitcher_<year>` frame is missing is reported and skipped
# instead of aborting the whole cell with a NameError.
for y in range(2021, 2026):
    if globals().get(f"pa_pitcher_{y}") is None:
        print(f"pa_pitcher_{y}: (not found)")
        continue
    display(HTML(f"<h4>Season {y}</h4>"))
    display(globals()[f"pa_pitcher_{y}"].head(5))

Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
3,632169,2021-04-10,SF,COL,1,Bot,2,0,0,0,...,1,0,0,True,0,0,0,0,0,1
14,632169,2021-04-10,SF,COL,1,Bot,7,0,0,0,...,1,0,0,True,0,1,0,0,0,0
15,632169,2021-04-10,SF,COL,1,Bot,7,0,0,0,...,1,0,0,True,0,0,0,0,1,0
19,632169,2021-04-10,SF,COL,1,Bot,4,1,0,0,...,1,0,0,True,0,0,0,0,0,1
27,632169,2021-04-10,SF,COL,1,Bot,8,2,0,0,...,1,0,0,True,0,0,0,0,0,1


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
0,661032,2022-04-26,LAA,CLE,1,Bot,5,0,0,0,...,1,0,0,True,0,0,0,0,0,1
1,661032,2022-04-26,LAA,CLE,1,Bot,1,1,0,0,...,1,0,0,True,0,0,0,0,0,1
2,661032,2022-04-26,LAA,CLE,1,Bot,5,2,0,0,...,1,0,0,True,0,0,0,1,0,1
3,661032,2022-04-26,LAA,CLE,2,Bot,2,0,0,0,...,1,0,0,True,0,0,0,0,0,1
4,661032,2022-04-26,LAA,CLE,2,Bot,1,1,0,0,...,1,0,0,True,0,0,0,0,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
0,716352,2023-10-01,KC,NYY,1,Top,1,0,0,0,...,1,0,0,True,0,0,0,0,0,2
4,716352,2023-10-01,KC,NYY,1,Top,2,0,0,0,...,1,0,0,True,0,0,0,0,1,0
7,716352,2023-10-01,KC,NYY,1,Top,3,0,0,0,...,1,0,0,True,0,0,0,0,1,0
15,716352,2023-10-01,KC,NYY,1,Top,10,0,0,0,...,1,0,0,True,0,0,0,0,1,0
18,716352,2023-10-01,KC,NYY,1,Top,3,2,0,0,...,1,0,0,True,0,0,0,0,0,1


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
0,744795,2024-09-25,WSH,KC,1,Bot,4,0,0,0,...,1,0,0,True,0,0,0,0,0,1
1,744795,2024-09-25,WSH,KC,1,Bot,6,1,0,0,...,1,0,0,True,0,0,0,1,0,1
2,744795,2024-09-25,WSH,KC,1,Bot,3,2,0,0,...,1,0,0,True,0,0,0,0,0,1
3,744795,2024-09-25,WSH,KC,2,Bot,1,0,0,0,...,1,0,0,True,0,0,0,0,0,1
4,744795,2024-09-25,WSH,KC,2,Bot,4,0,0,0,...,1,0,0,True,0,1,0,0,0,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
4,776135,2025-09-28,LAA,HOU,1,Bot,5,0,0,1,...,1,0,0,True,0,0,0,0,0,1
11,776135,2025-09-28,LAA,HOU,1,Bot,4,1,1,1,...,1,0,0,True,0,0,0,1,0,1
14,776135,2025-09-28,LAA,HOU,1,Bot,6,1,0,1,...,1,0,0,True,1,0,0,0,1,0
18,776135,2025-09-28,LAA,HOU,1,Bot,4,2,1,1,...,1,0,0,True,0,0,0,0,0,1
22,776135,2025-09-28,LAA,HOU,2,Bot,4,0,1,1,...,1,0,0,True,0,0,0,0,0,1


### Splitting Starters and Bullpen

To construct pitching features, we separate **starting pitcher** plate appearances from those thrown by the **bullpen**. 

By splitting the data into starter and bullpen subsets, we can compute rolling metrics for the starting pitcher at the individual level, while separately aggregating bullpen performance at the team level. This ensures that each set of pitching features accurately reflects the intended pitcher role and avoids mixing starter and relief appearances.

Before splitting starters and bullpens, we examine the number of complete games thrown by starters. In games where a starter goes the distance, there will be no corresponding bullpen entry for that team. Identifying these cases upfront ensures we understand any structural differences in the resulting dataframes and prevents unintended row mismatches during merging.

-----

**Move this description elsewhere**: This distinction is necessary because starting pitchers and relievers play fundamentally different roles, and their contributions are used differently when computing rolling pitching metrics.



In [31]:
# Complete-game summary for the 2022–2025 seasons.
# Each year is paired with its indicator-augmented plate-appearance frame.
for year, df in zip(
    range(2022, 2026),
    (pa_pitcher_2022, pa_pitcher_2023, pa_pitcher_2024, pa_pitcher_2025),
):
    out = summarize_complete_games(df)
    # One print call; sep="\n" yields the same lines as separate prints.
    print(
        f"\n=== {year} ===",
        f"Games in df: {out['n_games']}",
        f"Team starter-complete-games (team-game occurrences): {out['n_team_starter_complete_games']}",
        f"Games where BOTH starters went the distance: {out['n_games_both_starters_complete']}",
        f"Games where AT LEAST ONE starter went the distance: {out['n_games_any_starter_complete']}",
        sep="\n",
    )


=== 2022 ===
Games in df: 2430
Team starter-complete-games (team-game occurrences): 36
Games where BOTH starters went the distance: 2
Games where AT LEAST ONE starter went the distance: 34

=== 2023 ===
Games in df: 2430
Team starter-complete-games (team-game occurrences): 35
Games where BOTH starters went the distance: 1
Games where AT LEAST ONE starter went the distance: 34

=== 2024 ===
Games in df: 2429
Team starter-complete-games (team-game occurrences): 28
Games where BOTH starters went the distance: 1
Games where AT LEAST ONE starter went the distance: 27

=== 2025 ===
Games in df: 2430
Team starter-complete-games (team-game occurrences): 29
Games where BOTH starters went the distance: 1
Games where AT LEAST ONE starter went the distance: 28


#### Analysis

Across the 2022–2025 seasons summarized above, complete games by starters are relatively rare, ranging from 27 to 34 games per year in which at least one starter went the distance. Team-level complete-game occurrences range from 28 to 36 per season, with only 1–2 games per year featuring both starters completing the game. This confirms that the vast majority of games include bullpen appearances, and only a small fraction of team-game observations will lack a corresponding bullpen row. Structurally, starter–bullpen splits are therefore largely balanced; however, this small asymmetry must be explicitly addressed during merging to ensure that complete-game cases do not introduce unintended row mismatches or missing values in aggregated metrics.

### Split Execution

In [32]:
# Split each season's `pa_pitcher_<year>` frame into starter and bullpen
# frames, stored as `pa_starter_<year>` and `pa_bullpen_<year>`.
for y in range(2021, 2026):
    df = globals().get(f"pa_pitcher_{y}")

    if df is not None:
        starter_df, bullpen_df = split_starter_bullpen(df, validate=True)

        globals().update(
            {f"pa_starter_{y}": starter_df, f"pa_bullpen_{y}": bullpen_df}
        )

        print(f"{y}: starter={len(starter_df):,} | bullpen={len(bullpen_df):,}")
    else:
        print(f"pa_pitcher_{y}: (not found)")


2021: starter=103,581 | bullpen=78,235
2022: starter=106,549 | bullpen=75,495
2023: starter=106,334 | bullpen=77,770
2024: starter=106,931 | bullpen=75,509
2025: starter=106,529 | bullpen=76,397


In [33]:
# Preview the first five rows of the starter frames, then the bullpen frames.
# The loops replace ten copy-pasted lines and fix the 2025 headers, which
# were missing the space in "Pitcher Season". A season whose frame is missing
# is reported and skipped.
for y in range(2021, 2026):
    if globals().get(f"pa_starter_{y}") is None:
        print(f"pa_starter_{y}: (not found)")
        continue
    display(HTML(f"<h4>Starting Pitcher Season {y}</h4>"))
    display(globals()[f"pa_starter_{y}"].head(5))

# Blank separator line between the starter and bullpen previews.
print(" ")

for y in range(2021, 2026):
    if globals().get(f"pa_bullpen_{y}") is None:
        print(f"pa_bullpen_{y}: (not found)")
        continue
    display(HTML(f"<h4>Bullpen Pitcher Season {y}</h4>"))
    display(globals()[f"pa_bullpen_{y}"].head(5))


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
3,632169,2021-04-10,SF,COL,1,Bot,2,0,0,0,...,1,0,0,True,0,0,0,0,0,1
14,632169,2021-04-10,SF,COL,1,Bot,7,0,0,0,...,1,0,0,True,0,1,0,0,0,0
15,632169,2021-04-10,SF,COL,1,Bot,7,0,0,0,...,1,0,0,True,0,0,0,0,1,0
19,632169,2021-04-10,SF,COL,1,Bot,4,1,0,0,...,1,0,0,True,0,0,0,0,0,1
27,632169,2021-04-10,SF,COL,1,Bot,8,2,0,0,...,1,0,0,True,0,0,0,0,0,1


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
0,661032,2022-04-26,LAA,CLE,1,Bot,5,0,0,0,...,1,0,0,True,0,0,0,0,0,1
1,661032,2022-04-26,LAA,CLE,1,Bot,1,1,0,0,...,1,0,0,True,0,0,0,0,0,1
2,661032,2022-04-26,LAA,CLE,1,Bot,5,2,0,0,...,1,0,0,True,0,0,0,1,0,1
3,661032,2022-04-26,LAA,CLE,2,Bot,2,0,0,0,...,1,0,0,True,0,0,0,0,0,1
4,661032,2022-04-26,LAA,CLE,2,Bot,1,1,0,0,...,1,0,0,True,0,0,0,0,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
0,716352,2023-10-01,KC,NYY,1,Top,1,0,0,0,...,1,0,0,True,0,0,0,0,0,2
4,716352,2023-10-01,KC,NYY,1,Top,2,0,0,0,...,1,0,0,True,0,0,0,0,1,0
7,716352,2023-10-01,KC,NYY,1,Top,3,0,0,0,...,1,0,0,True,0,0,0,0,1,0
15,716352,2023-10-01,KC,NYY,1,Top,10,0,0,0,...,1,0,0,True,0,0,0,0,1,0
18,716352,2023-10-01,KC,NYY,1,Top,3,2,0,0,...,1,0,0,True,0,0,0,0,0,1


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
0,744795,2024-09-25,WSH,KC,1,Bot,4,0,0,0,...,1,0,0,True,0,0,0,0,0,1
1,744795,2024-09-25,WSH,KC,1,Bot,6,1,0,0,...,1,0,0,True,0,0,0,1,0,1
2,744795,2024-09-25,WSH,KC,1,Bot,3,2,0,0,...,1,0,0,True,0,0,0,0,0,1
3,744795,2024-09-25,WSH,KC,2,Bot,1,0,0,0,...,1,0,0,True,0,0,0,0,0,1
4,744795,2024-09-25,WSH,KC,2,Bot,4,0,0,0,...,1,0,0,True,0,1,0,0,0,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
4,776135,2025-09-28,LAA,HOU,1,Bot,5,0,0,1,...,1,0,0,True,0,0,0,0,0,1
11,776135,2025-09-28,LAA,HOU,1,Bot,4,1,1,1,...,1,0,0,True,0,0,0,1,0,1
14,776135,2025-09-28,LAA,HOU,1,Bot,6,1,0,1,...,1,0,0,True,1,0,0,0,1,0
18,776135,2025-09-28,LAA,HOU,1,Bot,4,2,1,1,...,1,0,0,True,0,0,0,0,0,1
22,776135,2025-09-28,LAA,HOU,2,Bot,4,0,1,1,...,1,0,0,True,0,0,0,0,0,1


 


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
83,632169,2021-04-10,SF,COL,6,Bot,2,0,1,3,...,0,0,0,True,0,0,0,0,0,1
90,632169,2021-04-10,SF,COL,6,Bot,5,0,1,3,...,0,0,0,True,0,1,0,0,0,0
93,632169,2021-04-10,SF,COL,6,Bot,7,0,1,3,...,0,0,0,True,0,1,0,0,0,0
102,632169,2021-04-10,SF,COL,6,Bot,5,1,4,3,...,0,0,0,True,0,0,0,0,0,1
104,632169,2021-04-10,SF,COL,6,Bot,6,1,1,3,...,0,0,0,True,1,0,0,0,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
24,661032,2022-04-26,LAA,CLE,6,Bot,1,2,4,0,...,0,0,0,True,0,0,0,0,0,1
25,661032,2022-04-26,LAA,CLE,7,Bot,3,0,4,0,...,0,0,0,True,0,0,0,0,0,1
26,661032,2022-04-26,LAA,CLE,7,Bot,1,1,4,0,...,0,0,0,True,0,0,0,0,0,1
27,661032,2022-04-26,LAA,CLE,7,Bot,7,2,4,0,...,0,0,0,True,0,0,0,1,0,1
28,661032,2022-04-26,LAA,CLE,8,Bot,1,0,4,0,...,0,0,0,True,0,0,0,0,0,1


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
66,716352,2023-10-01,KC,NYY,6,Top,2,0,5,0,...,0,0,0,True,0,0,0,0,0,1
81,716352,2023-10-01,KC,NYY,6,Top,5,1,5,0,...,0,0,0,True,0,0,0,0,1,0
83,716352,2023-10-01,KC,NYY,6,Top,7,1,5,0,...,0,0,0,True,0,0,0,0,0,1
86,716352,2023-10-01,KC,NYY,6,Top,2,2,5,2,...,0,0,0,True,0,0,0,0,0,1
89,716352,2023-10-01,KC,NYY,6,Top,4,2,5,0,...,0,0,0,True,0,0,0,0,1,0


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
8,744795,2024-09-25,WSH,KC,3,Bot,1,1,0,1,...,0,0,0,True,0,0,0,0,0,1
9,744795,2024-09-25,WSH,KC,3,Bot,5,1,0,1,...,0,0,0,True,0,1,0,0,0,0
10,744795,2024-09-25,WSH,KC,3,Bot,1,2,0,1,...,0,0,0,True,0,0,0,0,1,0
11,744795,2024-09-25,WSH,KC,3,Bot,4,2,0,1,...,0,0,0,True,0,0,0,0,0,1
12,744795,2024-09-25,WSH,KC,4,Bot,4,0,0,1,...,0,0,0,True,0,0,0,0,0,1


Unnamed: 0,game_id,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,...,is_starter,starter_full_game,did_not_end_pa,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
58,776135,2025-09-28,LAA,HOU,4,Bot,5,0,1,1,...,0,0,0,True,0,0,0,1,0,1
63,776135,2025-09-28,LAA,HOU,4,Bot,5,1,1,1,...,0,0,0,True,0,0,0,0,0,1
65,776135,2025-09-28,LAA,HOU,4,Bot,2,2,1,1,...,0,0,0,True,0,0,0,0,0,1
69,776135,2025-09-28,LAA,HOU,5,Bot,4,0,1,5,...,0,0,0,True,0,0,0,1,0,1
73,776135,2025-09-28,LAA,HOU,5,Bot,4,1,1,5,...,0,0,0,True,0,0,0,0,0,1


### Aggregating Pitching Game Lines

To calculate pitching features, we first aggregate the pitch-level (plate appearance) data to a **game-level pitching line**. This step converts many plate appearance rows into a single summary row per game, containing the core inputs needed for rolling metrics (IP, H, BB, HBP, K, HR).

We use two aggregation levels depending on the feature set: for **starting pitchers**, we aggregate at the individual pitcher-by-game level, while for the **bullpen**, we aggregate at the team-by-game level. Creating these game lines makes it straightforward to compute rolling 3-day and 7-day pitching metrics and then compare home vs. away performance.


In [34]:
# Aggregate each season's starter and bullpen plate appearances into
# game-level pitching lines (`starter_lines_<year>`, `bullpen_lines_<year>`).
for year in range(2021, 2026):
    # Starters: grouped using the pitcher_name column as the pitcher key.
    globals()[f"starter_lines_{year}"] = aggregate_pitching_game_lines(
        globals()[f"pa_starter_{year}"], pitcher_id_col="pitcher_name"
    )

    # Bullpen: no pitcher key is passed, so the helper's default is used.
    globals()[f"bullpen_lines_{year}"] = aggregate_pitching_game_lines(
        globals()[f"pa_bullpen_{year}"]
    )

In [35]:
# Preview the first five rows of the starter game lines, then the bullpen
# game lines. The loops replace ten copy-pasted display lines.
for year in range(2021, 2026):
    display(HTML(f"<h4>Starting Pitcher Season {year}</h4>"))
    display(globals()[f"starter_lines_{year}"].head(5))

# Blank separator line between the starter and bullpen previews.
print(" ")

for year in range(2021, 2026):
    display(HTML(f"<h4>Bullpen Pitcher Season {year}</h4>"))
    display(globals()[f"bullpen_lines_{year}"].head(5))


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,634615,2021-04-01,COL,COL,LAD,1,starter,"Márquez, Germán",3.666667,6,6,0,2,0,0,0
1,634615,2021-04-01,LAD,COL,LAD,0,starter,"Kershaw, Clayton",5.333333,10,1,0,2,0,0,0
2,634618,2021-04-01,AZ,SD,AZ,0,starter,"Bumgarner, Madison",4.0,7,3,1,6,2,0,0
3,634618,2021-04-01,SD,SD,AZ,1,starter,"Darvish, Yu",4.666667,8,1,0,6,2,0,0
4,634622,2021-04-01,ATL,PHI,ATL,0,starter,"Fried, Max",4.666667,6,2,1,8,0,0,0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,661042,2022-04-07,HOU,LAA,HOU,0,starter,"Valdez, Framber",6.666667,2.0,1.0,0.0,6.0,0.0,0,0
1,661042,2022-04-07,LAA,LAA,HOU,1,starter,"Ohtani, Shohei",4.666667,4.0,1.0,0.0,9.0,0.0,0,0
2,661577,2022-04-07,ATL,ATL,CIN,1,starter,"Fried, Max",5.666667,8.0,1.0,1.0,5.0,0.0,0,0
3,661577,2022-04-07,CIN,ATL,CIN,0,starter,"Mahle, Tyler",5.0,3.0,2.0,0.0,7.0,0.0,0,0
4,662021,2022-04-07,PIT,STL,PIT,0,starter,"Brubaker, JT",3.0,4.0,3.0,0.0,2.0,1.0,0,0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,718767,2023-03-30,CLE,SEA,CLE,0,starter,"Bieber, Shane",6.0,6,0,0,3,0,0,0
1,718767,2023-03-30,SEA,SEA,CLE,1,starter,"Castillo, Luis",6.0,1,0,0,6,0,0,0
2,718768,2023-03-30,CWS,HOU,CWS,0,starter,"Cease, Dylan",6.333333,2,0,1,10,0,0,0
3,718768,2023-03-30,HOU,HOU,CWS,1,starter,"Valdez, Framber",5.0,6,0,1,4,0,0,0
4,718769,2023-03-30,ATH,ATH,LAA,1,starter,"Muller, Kyle",5.0,4,1,0,3,0,0,0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,745444,2024-03-20,LAD,SD,LAD,0,starter,"Glasnow, Tyler",5.0,2.0,4.0,0.0,3.0,0.0,0,0
1,745444,2024-03-20,SD,SD,LAD,1,starter,"Darvish, Yu",3.666667,2.0,3.0,0.0,3.0,0.0,0,0
2,746175,2024-03-21,LAD,LAD,SD,1,starter,"Yamamoto, Yoshinobu",1.0,4.0,1.0,1.0,2.0,0.0,0,0
3,746175,2024-03-21,SD,LAD,SD,0,starter,"Musgrove, Joe",2.666667,7.0,2.0,0.0,2.0,0.0,0,0
4,745039,2024-03-28,CHC,TEX,CHC,0,starter,"Steele, Justin",4.666667,3.0,1.0,0.0,6.0,0.0,0,0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,778563,2025-03-18,CHC,CHC,LAD,1,starter,"Imanaga, Shota",4.0,0,4,0,2,0,0,0
1,778563,2025-03-18,LAD,CHC,LAD,0,starter,"Yamamoto, Yoshinobu",5.0,3,1,0,4,0,0,0
2,778564,2025-03-19,CHC,CHC,LAD,1,starter,"Steele, Justin",4.0,5,1,0,5,2,0,0
3,778564,2025-03-19,LAD,CHC,LAD,0,starter,"Sasaki, Roki",3.0,1,5,0,3,0,0,0
4,778545,2025-03-27,ATL,SD,ATL,0,starter,"Sale, Chris",5.0,6,1,0,7,0,0,0


 


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,634615,2021-04-01,COL,COL,LAD,1,bullpen,4.666667,9,2,1,4,0,0,0
1,634615,2021-04-01,LAD,COL,LAD,0,bullpen,2.333333,1,2,0,2,0,0,0
2,634618,2021-04-01,AZ,SD,AZ,0,bullpen,4.0,3,2,0,4,0,0,0
3,634618,2021-04-01,SD,SD,AZ,1,bullpen,4.333333,4,0,0,6,2,0,0
4,634622,2021-04-01,ATL,PHI,ATL,0,bullpen,4.666667,3,2,0,5,0,0,0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,661042,2022-04-07,HOU,LAA,HOU,0,bullpen,2.333333,2,0,1,1,0,0,0
1,661042,2022-04-07,LAA,LAA,HOU,1,bullpen,4.333333,4,2,0,1,2,0,0
2,661577,2022-04-07,ATL,ATL,CIN,1,bullpen,3.333333,2,0,0,8,1,0,0
3,661577,2022-04-07,CIN,ATL,CIN,0,bullpen,4.0,1,3,0,6,1,0,0
4,662021,2022-04-07,PIT,STL,PIT,0,bullpen,5.0,4,4,1,3,2,0,0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,718767,2023-03-30,CLE,SEA,CLE,0,bullpen,2.0,1,1,1,2,1,0,0
1,718767,2023-03-30,SEA,SEA,CLE,1,bullpen,3.0,3,0,0,3,0,0,0
2,718768,2023-03-30,CWS,HOU,CWS,0,bullpen,2.666667,2,2,0,3,1,0,0
3,718768,2023-03-30,HOU,HOU,CWS,1,bullpen,3.666667,5,1,0,5,1,0,0
4,718769,2023-03-30,ATH,ATH,LAA,1,bullpen,4.0,1,3,0,5,0,0,0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,745444,2024-03-20,LAD,SD,LAD,0,bullpen,4.0,2,0,0,3,0,0,0
1,745444,2024-03-20,SD,SD,LAD,1,bullpen,5.333333,5,6,1,4,0,0,0
2,746175,2024-03-21,LAD,LAD,SD,1,bullpen,8.0,14,5,1,6,1,0,0
3,746175,2024-03-21,SD,LAD,SD,0,bullpen,6.333333,9,4,1,7,1,0,0
4,745039,2024-03-28,CHC,TEX,CHC,0,bullpen,5.0,5,5,0,4,2,0,0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR,did_not_end_pa,starter_full_game
0,778563,2025-03-18,CHC,CHC,LAD,1,bullpen,5.0,7,4,0,7,0,0,0
1,778563,2025-03-18,LAD,CHC,LAD,0,bullpen,4.0,0,0,1,5,0,0,0
2,778564,2025-03-19,CHC,CHC,LAD,1,bullpen,5.0,2,6,0,4,1,0,0
3,778564,2025-03-19,LAD,CHC,LAD,0,bullpen,6.0,7,2,0,9,0,0,0
4,778545,2025-03-27,ATL,SD,ATL,0,bullpen,3.0,4,1,0,2,1,0,0


### Starter Coverage Validation

After splitting the data into starter and bullpen tables, we validate that each game contains exactly two starters—one for each team. This step ensures that no starting pitcher was inadvertently excluded during filtering.

Because the plate appearance dataset only retains rows with valid PA-ending events (`is_pa_countable == True`), a starter who exits before completing a full plate appearance could be missing from the filtered data. By summarizing complete-game occurrences and confirming two pitching teams per game, we verify structural integrity before proceeding with aggregation and merging.

In [36]:
# Map each validated season (2022–2025) to its starter game-line frame.
starter_lines_by_year = dict(
    zip(
        range(2022, 2026),
        (starter_lines_2022, starter_lines_2023, starter_lines_2024, starter_lines_2025),
    )
)

# Run the starter-coverage validation in strict mode.
validate_starter_lines_by_year(starter_lines_by_year, strict=True)


=== 2022 starter_lines ===
rows: 4,860 | unique games: 2,430 | expected rows (2*games): 4,860
pitcher_name NaNs: 0
games with starter rowcount != 2: 0

=== 2023 starter_lines ===
rows: 4,860 | unique games: 2,430 | expected rows (2*games): 4,860
pitcher_name NaNs: 0
games with starter rowcount != 2: 0

=== 2024 starter_lines ===
rows: 4,858 | unique games: 2,429 | expected rows (2*games): 4,858
pitcher_name NaNs: 0
games with starter rowcount != 2: 0

=== 2025 starter_lines ===
rows: 4,860 | unique games: 2,430 | expected rows (2*games): 4,860
pitcher_name NaNs: 0
games with starter rowcount != 2: 0


In [37]:
# Validation: for each 2022-2025 plate-appearance table, report how many rows
# are flagged did_not_end_pa and how many distinct games those rows belong to.
# A table without the column reports zero for both.
for season in range(2022, 2026):
    pa_frame = globals()[f"pa_{season}"]

    if "did_not_end_pa" in pa_frame.columns:
        flag = pa_frame["did_not_end_pa"]
        flagged_rows = int(flag.sum())
        flagged_games = pa_frame.loc[flag == 1, "game_id"].nunique()
    else:
        flagged_rows = 0
        flagged_games = 0

    print(f"{season}: did_not_end_pa rows = {flagged_rows:,} across {flagged_games} games")
    

2022: did_not_end_pa rows = 0 across 0 games
2023: did_not_end_pa rows = 0 across 0 games
2024: did_not_end_pa rows = 0 across 0 games
2025: did_not_end_pa rows = 0 across 0 games


### Copying Dataframes

Below, I make copies of `starter_lines_yyyy` and `bullpen_lines_yyyy` (stored as `starters_yyyy` and `bullpen_yyyy`), dropping any columns whose names start with `roll`. These copies are used later to help fill in missing data.

In [38]:
# Snapshot each season's starter and bullpen game lines under the shorter
# names starters_<year> / bullpen_<year>, leaving out every column whose
# name begins with "roll".
for season in range(2021, 2026):
    starter_copy = globals()[f"starter_lines_{season}"].copy()
    bullpen_copy = globals()[f"bullpen_lines_{season}"].copy()

    # Boolean masks of the columns to keep (everything not prefixed "roll").
    starter_keep = ~starter_copy.columns.str.startswith("roll")
    bullpen_keep = ~bullpen_copy.columns.str.startswith("roll")

    globals()[f"starters_{season}"] = starter_copy.loc[:, starter_keep]
    globals()[f"bullpen_{season}"] = bullpen_copy.loc[:, bullpen_keep]

### Rolling Pitching Counts

Next, we compute rolling **3-day** and **7-day** totals for the pitching statistics needed to construct our rate metrics (IP, H, BB, HBP, K, HR). We calculate these rolling counts at different levels depending on pitcher role: for **starters**, we roll at the individual pitcher level, while for the **bullpen**, we roll at the team level. This produces time-based aggregates using only prior games, which are later used to compute rolling FIP, WHIP, K/9, and HR/9.


In [39]:
# Starters: work on a copy, make sure a pitcher_role column exists, then add
# the rolling counts with pitcher_name passed as the pitcher column.
for season in range(2021, 2026):
    starter_key = f"starter_lines_{season}"
    starter_frame = globals()[starter_key].copy()

    role_missing = "pitcher_role" not in starter_frame.columns
    if role_missing:
        starter_frame["pitcher_role"] = "starter"

    globals()[starter_key] = add_rolling_pitching_counts(
        starter_frame, pitcher_col="pitcher_name"
    )

# Bullpen: same helper with its default arguments (no pitcher column given).
for season in range(2021, 2026):
    bullpen_key = f"bullpen_lines_{season}"
    bullpen_frame = globals()[bullpen_key]
    globals()[bullpen_key] = add_rolling_pitching_counts(bullpen_frame)


  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)


In [40]:
# Preview the first five rows of every season's starter table, then (after a
# blank separator line) every season's bullpen table.
for year in range(2021, 2026):
    display(HTML(f"<h4>Starting Pitcher Season {year}</h4>"))
    display(globals()[f"starter_lines_{year}"].head(5))

print(" ")

for year in range(2021, 2026):
    display(HTML(f"<h4>Bullpen Pitcher Season {year}</h4>"))
    display(globals()[f"bullpen_lines_{year}"].head(5))


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,632266,2021-10-01,CHC,STL,CHC,0,starter,"Abbott, Cory",5.0,4,...,,,,,,,,,,
1,632234,2021-10-03,WSH,WSH,BOS,1,starter,"Adon, Joan",5.0,6,...,,,,,,,,,,
2,633872,2021-05-30,BAL,CWS,BAL,0,starter,"Akin, Keegan",4.666667,5,...,,,,,,,,,,
3,633818,2021-06-04,BAL,BAL,CLE,1,starter,"Akin, Keegan",5.0,3,...,2.0,0.0,4.0,1.0,4.666667,5.0,2.0,0.0,4.0,1.0
4,633781,2021-06-11,BAL,TB,BAL,0,starter,"Akin, Keegan",3.666667,5,...,1.0,0.0,4.0,0.0,9.666667,8.0,3.0,0.0,8.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,662529,2022-08-02,WSH,WSH,NYM,1,starter,"Abbott, Cory",5.0,2.0,...,,,,,,,,,,
1,661558,2022-08-07,WSH,PHI,WSH,0,starter,"Abbott, Cory",3.333333,7.0,...,2.0,1.0,3.0,0.0,5.0,2.0,2.0,1.0,3.0,0.0
2,662551,2022-08-12,WSH,WSH,SD,1,starter,"Abbott, Cory",4.0,4.0,...,5.0,1.0,2.0,4.0,8.333333,9.0,7.0,2.0,5.0,4.0
3,662470,2022-08-17,WSH,WSH,CHC,1,starter,"Abbott, Cory",6.0,3.0,...,3.0,0.0,5.0,0.0,7.333333,11.0,8.0,1.0,7.0,4.0
4,661924,2022-09-07,WSH,STL,WSH,0,starter,"Abbott, Cory",4.0,5.0,...,1.0,0.0,5.0,1.0,10.0,7.0,4.0,0.0,10.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,717883,2023-06-05,CIN,CIN,MIL,1,starter,"Abbott, Andrew",6.0,1,...,,,,,,,,,,
1,717817,2023-06-10,CIN,STL,CIN,0,starter,"Abbott, Andrew",5.666667,5,...,4.0,0.0,6.0,0.0,6.0,1.0,4.0,0.0,6.0,0.0
2,717737,2023-06-16,CIN,HOU,CIN,0,starter,"Abbott, Andrew",6.0,4,...,3.0,0.0,4.0,0.0,11.666667,6.0,7.0,0.0,10.0,0.0
3,717669,2023-06-21,CIN,CIN,COL,1,starter,"Abbott, Andrew",6.0,4,...,2.0,0.0,2.0,0.0,11.666667,9.0,5.0,0.0,6.0,0.0
4,717601,2023-06-27,CIN,BAL,CIN,0,starter,"Abbott, Andrew",6.0,2,...,0.0,0.0,10.0,3.0,12.0,8.0,2.0,0.0,12.0,3.0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,745600,2024-04-01,CIN,PHI,CIN,0,starter,"Abbott, Andrew",5.333333,3.0,...,,,,,,,,,,
1,746734,2024-04-07,CIN,CIN,NYM,1,starter,"Abbott, Andrew",5.0,7.0,...,2.0,0.0,4.0,0.0,5.333333,3.0,2.0,0.0,4.0,0.0
2,746812,2024-04-12,CIN,CWS,CIN,0,starter,"Abbott, Andrew",7.0,4.0,...,2.0,1.0,4.0,1.0,10.333333,10.0,4.0,1.0,8.0,1.0
3,745267,2024-04-17,CIN,SEA,CIN,0,starter,"Abbott, Andrew",6.0,4.0,...,0.0,0.0,3.0,0.0,12.0,11.0,2.0,1.0,7.0,1.0
4,746722,2024-04-23,CIN,CIN,PHI,1,starter,"Abbott, Andrew",4.333333,2.0,...,3.0,0.0,6.0,2.0,13.0,8.0,3.0,0.0,9.0,2.0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,778357,2025-04-12,CIN,CIN,PIT,1,starter,"Abbott, Andrew",5.0,2,...,,,,,,,,,,
1,778276,2025-04-18,CIN,BAL,CIN,0,starter,"Abbott, Andrew",6.0,2,...,2.0,0.0,5.0,1.0,5.0,2.0,2.0,0.0,5.0,1.0
2,778181,2025-04-25,CIN,COL,CIN,0,starter,"Abbott, Andrew",4.0,5,...,1.0,0.0,11.0,1.0,11.0,4.0,3.0,0.0,16.0,2.0
3,778107,2025-05-01,CIN,CIN,STL,1,starter,"Abbott, Andrew",4.0,3,...,5.0,0.0,4.0,1.0,4.0,5.0,5.0,0.0,4.0,1.0
4,778034,2025-05-06,CIN,ATL,CIN,0,starter,"Abbott, Andrew",5.0,4,...,4.0,0.0,3.0,0.0,8.0,8.0,9.0,0.0,7.0,1.0


 


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,634640,2021-04-01,ATH,ATH,HOU,1,bullpen,3.666667,5,4,...,,,,,,,,,,
1,634605,2021-04-02,ATH,ATH,HOU,1,bullpen,4.0,6,4,...,4.0,1.0,2.0,2.0,3.666667,5.0,4.0,1.0,2.0,2.0
2,634629,2021-04-03,ATH,ATH,HOU,1,bullpen,4.666667,6,1,...,8.0,1.0,5.0,2.0,7.666667,11.0,8.0,1.0,5.0,2.0
3,634651,2021-04-04,ATH,ATH,HOU,1,bullpen,4.333333,5,1,...,9.0,1.0,9.0,3.0,12.333333,17.0,9.0,1.0,9.0,3.0
4,634600,2021-04-05,ATH,ATH,LAD,1,bullpen,6.333333,7,5,...,6.0,0.0,8.0,2.0,16.666667,22.0,10.0,1.0,10.0,4.0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,661131,2022-04-08,ATH,PHI,ATH,0,bullpen,2.666667,5,4,...,,,,,,,,,,
1,661130,2022-04-09,ATH,PHI,ATH,0,bullpen,2.333333,0,3,...,4.0,0.0,2.0,0.0,2.666667,5.0,4.0,0.0,2.0,0.0
2,661129,2022-04-10,ATH,PHI,ATH,0,bullpen,4.0,1,1,...,7.0,0.0,5.0,0.0,5.0,5.0,7.0,0.0,5.0,0.0
3,661915,2022-04-11,ATH,TB,ATH,0,bullpen,4.0,5,0,...,8.0,0.0,10.0,1.0,9.0,6.0,8.0,0.0,10.0,1.0
4,661944,2022-04-12,ATH,TB,ATH,0,bullpen,8.0,8,5,...,4.0,0.0,14.0,1.0,13.0,11.0,8.0,0.0,16.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,718769,2023-03-30,ATH,ATH,LAA,1,bullpen,4.0,1,3,...,,,,,,,,,,
1,718757,2023-04-01,ATH,ATH,LAA,1,bullpen,6.666667,6,3,...,3.0,0.0,5.0,0.0,4.0,1.0,3.0,0.0,5.0,0.0
2,718734,2023-04-02,ATH,ATH,LAA,1,bullpen,3.333333,2,1,...,6.0,0.0,8.0,1.0,10.666667,7.0,6.0,0.0,8.0,1.0
3,718721,2023-04-03,ATH,ATH,CLE,1,bullpen,5.0,9,2,...,4.0,0.0,5.0,1.0,14.0,9.0,7.0,0.0,10.0,1.0
4,718707,2023-04-04,ATH,ATH,CLE,1,bullpen,4.333333,0,2,...,6.0,1.0,8.0,1.0,19.0,18.0,9.0,1.0,13.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,745687,2024-03-28,ATH,ATH,CLE,1,bullpen,5.333333,4,2,...,,,,,,,,,,
1,745682,2024-03-29,ATH,ATH,CLE,1,bullpen,3.666667,2,4,...,2.0,2.0,4.0,0.0,5.333333,4.0,2.0,2.0,4.0,0.0
2,745684,2024-03-30,ATH,ATH,CLE,1,bullpen,5.333333,9,3,...,6.0,2.0,7.0,0.0,9.0,6.0,6.0,2.0,7.0,0.0
3,745683,2024-03-31,ATH,ATH,CLE,1,bullpen,1.666667,4,1,...,9.0,3.0,12.0,1.0,14.333333,15.0,9.0,3.0,12.0,1.0
4,745675,2024-04-01,ATH,ATH,BOS,1,bullpen,6.333333,1,3,...,8.0,2.0,10.0,1.0,16.0,19.0,10.0,4.0,14.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,778547,2025-03-27,ATH,SEA,ATH,0,bullpen,1.666667,2,3,...,,,,,,,,,,
1,778541,2025-03-28,ATH,SEA,ATH,0,bullpen,3.0,2,2,...,3.0,0.0,1.0,2.0,1.666667,2.0,3.0,0.0,1.0,2.0
2,778521,2025-03-29,ATH,SEA,ATH,0,bullpen,4.0,2,2,...,5.0,0.0,5.0,2.0,4.666667,4.0,5.0,0.0,5.0,2.0
3,778513,2025-03-30,ATH,SEA,ATH,0,bullpen,1.0,0,0,...,7.0,0.0,13.0,2.0,8.666667,6.0,7.0,0.0,13.0,2.0
4,778501,2025-03-31,ATH,ATH,CHC,1,bullpen,5.0,12,6,...,4.0,0.0,14.0,0.0,9.666667,6.0,7.0,0.0,15.0,2.0


### Rolling Pitching Rate Metrics

After computing rolling 3-day and 7-day **count** totals (IP, H, BB, HBP, K, HR), we can now convert these aggregates into rolling **rate-based** pitching metrics: **WHIP**, **K/9**, **HR/9**, and **FIP**. Importantly, these rates are calculated from the rolled sums (rather than rolling the ratios directly), and the logic applies to both **starting pitchers** (pitcher-level rolling) and the **bullpen** (team-level rolling).

**NOTE**: Excludes the calculation when `did_not_end_pa == 1`, meaning that the starting pitcher did not complete a plate appearance in that game (see *Starter Coverage Validation* above).


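**NOTE TO SELF**: Double-check that it is rolling correctly (e.g., in the starter output above, the `roll_3D_*` columns are populated from a start made 5–7 days earlier).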

In [41]:
# Derive the 3D and 7D rate columns from the rolled counts, first for every
# season's starter table and then for every season's bullpen table.
for table_prefix in ("starter_lines", "bullpen_lines"):
    for season in range(2021, 2026):
        table_key = f"{table_prefix}_{season}"
        rolled_frame = globals()[table_key]
        globals()[table_key] = add_rate_metrics_from_rolled_counts(
            rolled_frame,
            windows=("3D", "7D"),
        )


In [42]:
# Preview the first five rows of every season's starter table, then (after a
# blank separator line) every season's bullpen table, now with rate columns.
for year in range(2021, 2026):
    display(HTML(f"<h4>Starting Pitcher Season {year}</h4>"))
    display(globals()[f"starter_lines_{year}"].head(5))

print(" ")

for year in range(2021, 2026):
    display(HTML(f"<h4>Bullpen Pitcher Season {year}</h4>"))
    display(globals()[f"bullpen_lines_{year}"].head(5))


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,632266,2021-10-01,CHC,STL,CHC,0,starter,"Abbott, Cory",5.0,4,...,,,,,,,,,,
1,632234,2021-10-03,WSH,WSH,BOS,1,starter,"Adon, Joan",5.0,6,...,,,,,,,,,,
2,633872,2021-05-30,BAL,CWS,BAL,0,starter,"Akin, Keegan",4.666667,5,...,,,,,,,,,,
3,633818,2021-06-04,BAL,BAL,CLE,1,starter,"Akin, Keegan",5.0,3,...,4.0,1.0,1.5,7.714286,1.928571,2.357143,1.5,7.714286,1.928571,2.357143
4,633781,2021-06-11,BAL,TB,BAL,0,starter,"Akin, Keegan",3.666667,5,...,8.0,1.0,0.8,7.2,0.0,-1.0,1.137931,7.448276,0.931034,0.62069


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,662529,2022-08-02,WSH,WSH,NYM,1,starter,"Abbott, Cory",5.0,2.0,...,,,,,,,,,,
1,661558,2022-08-07,WSH,PHI,WSH,0,starter,"Abbott, Cory",3.333333,7.0,...,3.0,0.0,1.0,5.4,0.0,0.6,1.0,5.4,0.0,0.6
2,662551,2022-08-12,WSH,WSH,SD,1,starter,"Abbott, Cory",4.0,4.0,...,5.0,4.0,3.9,5.4,10.8,19.8,2.16,5.4,4.32,8.28
3,662470,2022-08-17,WSH,WSH,CHC,1,starter,"Abbott, Cory",6.0,3.0,...,7.0,4.0,1.75,11.25,0.0,-0.25,2.727273,8.590909,4.909091,8.863636
4,661924,2022-09-07,WSH,STL,WSH,0,starter,"Abbott, Cory",4.0,5.0,...,10.0,1.0,0.666667,7.5,1.5,1.0,1.1,9.0,0.9,0.5


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,717883,2023-06-05,CIN,CIN,MIL,1,starter,"Abbott, Andrew",6.0,1,...,,,,,,,,,,
1,717817,2023-06-10,CIN,STL,CIN,0,starter,"Abbott, Andrew",5.666667,5,...,6.0,0.0,0.833333,9.0,0.0,0.0,0.833333,9.0,0.0,0.0
2,717737,2023-06-16,CIN,HOU,CIN,0,starter,"Abbott, Andrew",6.0,4,...,10.0,0.0,1.411765,6.352941,0.0,0.176471,1.114286,7.714286,0.0,0.085714
3,717669,2023-06-21,CIN,CIN,COL,1,starter,"Abbott, Andrew",6.0,4,...,6.0,0.0,1.0,3.0,0.0,0.333333,1.2,4.628571,0.0,0.257143
4,717601,2023-06-27,CIN,BAL,CIN,0,starter,"Abbott, Andrew",6.0,2,...,12.0,3.0,0.666667,15.0,4.5,3.166667,0.833333,9.0,2.25,1.75


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,745600,2024-04-01,CIN,PHI,CIN,0,starter,"Abbott, Andrew",5.333333,3.0,...,,,,,,,,,,
1,746734,2024-04-07,CIN,CIN,NYM,1,starter,"Abbott, Andrew",5.0,7.0,...,4.0,0.0,0.9375,6.75,0.0,-0.375,0.9375,6.75,0.0,-0.375
2,746812,2024-04-12,CIN,CWS,CIN,0,starter,"Abbott, Andrew",7.0,4.0,...,8.0,1.0,2.0,7.2,1.8,2.8,1.451613,6.967742,0.870968,1.16129
3,745267,2024-04-17,CIN,SEA,CIN,0,starter,"Abbott, Andrew",6.0,4.0,...,7.0,1.0,0.571429,3.857143,0.0,-0.857143,1.166667,5.25,0.75,0.666667
4,746722,2024-04-23,CIN,CIN,PHI,1,starter,"Abbott, Andrew",4.333333,2.0,...,9.0,2.0,1.166667,9.0,3.0,3.833333,0.846154,6.230769,1.384615,1.307692


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,778357,2025-04-12,CIN,CIN,PIT,1,starter,"Abbott, Andrew",5.0,2,...,,,,,,,,,,
1,778276,2025-04-18,CIN,BAL,CIN,0,starter,"Abbott, Andrew",6.0,2,...,5.0,1.0,0.8,9.0,1.8,1.8,0.8,9.0,1.8,1.8
2,778181,2025-04-25,CIN,COL,CIN,0,starter,"Abbott, Andrew",4.0,5,...,16.0,2.0,0.5,16.5,1.5,-1.0,0.636364,13.090909,1.636364,0.272727
3,778107,2025-05-01,CIN,CIN,STL,1,starter,"Abbott, Andrew",4.0,3,...,4.0,1.0,2.5,9.0,2.25,5.0,2.5,9.0,2.25,5.0
4,778034,2025-05-06,CIN,ATL,CIN,0,starter,"Abbott, Andrew",5.0,4,...,7.0,1.0,1.75,6.75,0.0,1.5,2.125,7.875,1.125,3.25


 


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,634640,2021-04-01,ATH,ATH,HOU,1,bullpen,3.666667,5,4,...,,,,,,,,,,
1,634605,2021-04-02,ATH,ATH,HOU,1,bullpen,4.0,6,4,...,2.0,2.0,2.727273,4.909091,4.909091,10.090909,2.727273,4.909091,4.909091,10.090909
2,634629,2021-04-03,ATH,ATH,HOU,1,bullpen,4.666667,6,1,...,5.0,2.0,2.608696,5.869565,2.347826,5.608696,2.608696,5.869565,2.347826,5.608696
3,634651,2021-04-04,ATH,ATH,HOU,1,bullpen,4.333333,5,1,...,9.0,3.0,2.189189,6.567568,2.189189,4.135135,2.189189,6.567568,2.189189,4.135135
4,634600,2021-04-05,ATH,ATH,LAD,1,bullpen,6.333333,7,5,...,10.0,4.0,1.769231,5.538462,1.384615,2.153846,1.98,5.4,2.16,3.9


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,661131,2022-04-08,ATH,PHI,ATH,0,bullpen,2.666667,5,4,...,,,,,,,,,,
1,661130,2022-04-09,ATH,PHI,ATH,0,bullpen,2.333333,0,3,...,2.0,0.0,3.375,6.75,0.0,3.0,3.375,6.75,0.0,3.0
2,661129,2022-04-10,ATH,PHI,ATH,0,bullpen,4.0,1,1,...,5.0,0.0,2.4,9.0,0.0,2.2,2.4,9.0,0.0,2.2
3,661915,2022-04-11,ATH,TB,ATH,0,bullpen,4.0,5,0,...,10.0,1.0,1.555556,10.0,1.0,1.888889,1.555556,10.0,1.0,1.888889
4,661944,2022-04-12,ATH,TB,ATH,0,bullpen,8.0,8,5,...,16.0,1.0,0.967742,12.193548,0.870968,-0.290323,1.461538,11.076923,0.692308,0.384615


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,718769,2023-03-30,ATH,ATH,LAA,1,bullpen,4.0,1,3,...,,,,,,,,,,
1,718757,2023-04-01,ATH,ATH,LAA,1,bullpen,6.666667,6,3,...,5.0,0.0,1.0,11.25,0.0,-0.25,1.0,11.25,0.0,-0.25
2,718734,2023-04-02,ATH,ATH,LAA,1,bullpen,3.333333,2,1,...,8.0,1.0,1.21875,6.75,0.84375,1.40625,1.21875,6.75,0.84375,1.40625
3,718721,2023-04-03,ATH,ATH,CLE,1,bullpen,5.0,9,2,...,10.0,1.0,1.2,4.5,0.9,1.5,1.142857,6.428571,0.642857,1.0
4,718707,2023-04-04,ATH,ATH,CLE,1,bullpen,4.333333,0,2,...,13.0,1.0,1.6,4.8,0.6,1.2,1.473684,6.157895,0.473684,0.894737


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,745687,2024-03-28,ATH,ATH,CLE,1,bullpen,5.333333,4,2,...,,,,,,,,,,
1,745682,2024-03-29,ATH,ATH,CLE,1,bullpen,3.666667,2,4,...,4.0,0.0,1.5,6.75,0.0,0.75,1.5,6.75,0.0,0.75
2,745684,2024-03-30,ATH,ATH,CLE,1,bullpen,5.333333,9,3,...,7.0,0.0,1.555556,7.0,0.0,1.111111,1.555556,7.0,0.0,1.111111
3,745683,2024-03-31,ATH,ATH,CLE,1,bullpen,1.666667,4,1,...,12.0,1.0,1.883721,7.534884,0.627907,1.744186,1.883721,7.534884,0.627907,1.744186
4,745675,2024-04-01,ATH,ATH,BOS,1,bullpen,6.333333,1,3,...,14.0,1.0,2.34375,8.4375,0.84375,2.15625,2.0625,7.875,0.5625,1.6875


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,778547,2025-03-27,ATH,SEA,ATH,0,bullpen,1.666667,2,3,...,,,,,,,,,,
1,778541,2025-03-28,ATH,SEA,ATH,0,bullpen,3.0,2,2,...,1.0,2.0,3.0,5.4,10.8,19.8,3.0,5.4,10.8,19.8
2,778521,2025-03-29,ATH,SEA,ATH,0,bullpen,4.0,2,2,...,5.0,2.0,1.928571,9.642857,3.857143,6.642857,1.928571,9.642857,3.857143,6.642857
3,778513,2025-03-30,ATH,SEA,ATH,0,bullpen,1.0,0,0,...,13.0,2.0,1.5,13.5,2.076923,2.423077,1.5,13.5,2.076923,2.423077
4,778501,2025-03-31,ATH,ATH,CHC,1,bullpen,5.0,12,6,...,15.0,2.0,1.0,15.75,0.0,-2.0,1.344828,13.965517,1.862069,1.758621


### Dropping Rolled Component Columns

In [43]:
# For 2022-2025, run drop_rolled_component_cols over the starter table and
# then the bullpen table of each season, skipping any table that is not
# defined in the notebook namespace, and log each table that was processed.
for season in range(2022, 2026):
    for table_key in (f"starter_lines_{season}", f"bullpen_lines_{season}"):
        if table_key not in globals():
            continue
        globals()[table_key] = drop_rolled_component_cols(globals()[table_key])
        print(f"{season}: dropped rolled components from {table_key}")

2022: dropped rolled components from starter_lines_2022
2022: dropped rolled components from bullpen_lines_2022
2023: dropped rolled components from starter_lines_2023
2023: dropped rolled components from bullpen_lines_2023
2024: dropped rolled components from starter_lines_2024
2024: dropped rolled components from bullpen_lines_2024
2025: dropped rolled components from starter_lines_2025
2025: dropped rolled components from bullpen_lines_2025


## Missing Data

Before combining the starting pitcher and bullpen dataframes for each season, we need to address the issue of missing data. Because all rolling features are computed using prior games only, the first game(s) of each season naturally contain missing values. These missing values are structural and arise from the absence of historical data within the rolling window.

To handle this, I impute missing values using information from the previous season only, avoiding any leakage from the current season.

**Starting Pitchers**
1. If the starting pitcher appeared in the previous season, I impute using that pitcher’s previous-season average.
2. If the pitcher did not appear in the previous season, I impute using the league-wide average across all starting pitchers.

**Bullpen**
- Bullpen features are imputed using the team’s previous-season bullpen average, reflecting bullpen performance as a team-level construct.



### Summarizing Pitcher Rates

Now, we will aggregate prior-season pitching performance into a structured summary table that serves as the basis for imputation. The underlying function `summarize_pitching_rates` groups the data either by starting pitcher or by team (for bullpens), sums innings pitched and key counting statistics across the season, and then computes rate metrics from those totals to ensure they are properly innings-weighted. It also generates an overall league-level row that represents combined performance across all starters or all bullpens, which can be used as a fallback baseline when a pitcher did not appear in the previous season. The resulting table provides consistent, totals-based season averages that align with our no-leakage framework and support principled imputation of early-season missing rolling features.

In [44]:
# Build one summary table per season and role from the roll-free copies:
# starters_<year> -> starter_summary_<year>, bullpen_<year> -> bullpen_summary_<year>.
for season in range(2021, 2026):
    for role_kind, source_prefix in (("starter", "starters"), ("bullpen", "bullpen")):
        season_lines = globals()[f"{source_prefix}_{season}"]
        globals()[f"{role_kind}_summary_{season}"] = summarize_pitching_rates(
            season_lines,
            kind=role_kind,
        )

In [45]:
# Preview the first five rows of every season's starter summary, then (after
# a blank separator line) every season's bullpen summary.
for year in range(2021, 2026):
    display(HTML(f"<h4>Starting Pitcher Season {year}</h4>"))
    display(globals()[f"starter_summary_{year}"].head(5))

print(" ")

for year in range(2021, 2026):
    display(HTML(f"<h4>Bullpen Pitcher Season {year}</h4>"))
    display(globals()[f"bullpen_summary_{year}"].head(5))


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,24163.666667,23173.0,8101.0,1100.0,23381.0,3595.0,1.33978,8.708488,1.338994,1.141218,weighted_by_IP (from totals)
1,"Wheeler, Zack",212.666667,169.0,46.0,8.0,247.0,16.0,1.048589,10.452978,0.677116,-0.583072,group_totals
2,"Buehler, Walker",207.0,149.0,52.0,6.0,212.0,19.0,1.0,9.217391,0.826087,-0.014493,group_totals
3,"Wainwright, Adam",204.666667,168.0,50.0,9.0,174.0,21.0,1.109121,7.651466,0.923453,0.498371,group_totals
4,"Alcantara, Sandy",203.0,171.0,50.0,10.0,201.0,21.0,1.137931,8.91133,0.931034,0.251232,group_totals


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,25006.666667,23914.0,7939.0,1095.0,22991.0,3262.0,1.317569,8.274553,1.174007,0.940789,weighted_by_IP (from totals)
1,"Alcantara, Sandy",224.666667,174.0,50.0,9.0,207.0,16.0,1.037092,8.292285,0.64095,-0.12908,group_totals
2,"Nola, Aaron",202.666667,168.0,29.0,9.0,235.0,19.0,1.016447,10.435855,0.84375,-0.537829,group_totals
3,"Burnes, Corbin",200.666667,144.0,51.0,13.0,243.0,23.0,1.036545,10.898671,1.031561,0.024917,group_totals
4,"Valdez, Framber",199.666667,166.0,67.0,11.0,194.0,11.0,1.222037,8.744574,0.495826,-0.055092,group_totals


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,24674.333333,24280.0,8396.0,1065.0,23512.0,3675.0,1.367453,8.576037,1.340462,1.180741,weighted_by_IP (from totals)
1,"Webb, Logan",213.0,201.0,31.0,5.0,194.0,20.0,1.112676,8.197183,0.84507,-0.093897,group_totals
2,"Gallen, Zac",209.0,188.0,47.0,5.0,220.0,22.0,1.148325,9.473684,0.947368,0.009569,group_totals
3,"Cole, Gerrit",206.0,157.0,48.0,7.0,222.0,20.0,1.029126,9.699029,0.873786,-0.092233,group_totals
4,"Mikolas, Miles",198.0,226.0,39.0,8.0,137.0,26.0,1.378788,6.227273,1.181818,1.035354,group_totals


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,25021.0,23934.0,8135.0,1067.0,23492.0,3416.0,1.324328,8.450022,1.228728,1.00036,weighted_by_IP (from totals)
1,"Gilbert, Logan",207.666667,148.0,37.0,4.0,220.0,26.0,0.910112,9.53451,1.126806,0.101124,group_totals
2,"Lugo, Seth",205.0,177.0,48.0,9.0,181.0,16.0,1.141463,7.946341,0.702439,0.082927,group_totals
3,"Webb, Logan",202.0,202.0,50.0,2.0,172.0,11.0,1.257426,7.663366,0.490099,-0.222772,group_totals
4,"Wheeler, Zack",198.666667,139.0,52.0,8.0,224.0,20.0,1.001678,10.147651,0.90604,-0.040268,group_totals


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,24887.666667,23816.0,8297.0,1019.0,23247.0,3507.0,1.331262,8.406694,1.268219,1.086683,weighted_by_IP (from totals)
1,"Webb, Logan",204.0,210.0,46.0,6.0,224.0,14.0,1.284314,9.882353,0.617647,-0.539216,group_totals
2,"Crochet, Garrett",202.0,165.0,46.0,3.0,255.0,24.0,1.059406,11.361386,1.069307,-0.252475,group_totals
3,"Sánchez, Cristopher",200.333333,171.0,44.0,6.0,212.0,12.0,1.103161,9.524126,0.539101,-0.589018,group_totals
4,"Rodón, Carlos",194.0,132.0,73.0,9.0,203.0,22.0,1.103093,9.417526,1.020619,0.649485,group_totals


 


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,18020.0,16311.0,7693.0,1012.0,18764.0,2349.0,1.388235,9.371587,1.173196,1.061265,weighted_by_IP (from totals)
1,TB,696.666667,584.0,220.0,34.0,739.0,78.0,1.202871,9.54689,1.007656,0.427751,group_totals
2,SD,683.333333,593.0,263.0,50.0,715.0,89.0,1.325854,9.417073,1.172195,0.974634,group_totals
3,BAL,660.0,677.0,296.0,33.0,637.0,109.0,1.524242,8.686364,1.486364,1.712121,group_totals
4,PIT,639.666667,584.0,308.0,38.0,641.0,84.0,1.453882,9.01876,1.181866,1.32569,group_totals


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,17597.333333,15759.0,6913.0,950.0,17819.0,1953.0,1.342362,9.113369,0.998845,0.758069,weighted_by_IP (from totals)
1,TB,678.0,583.0,213.0,37.0,657.0,83.0,1.228614,8.721239,1.10177,0.759587,group_totals
2,CHC,653.333333,587.0,276.0,41.0,716.0,98.0,1.383673,9.863265,1.35,1.213776,group_totals
3,MIN,647.0,595.0,243.0,30.0,675.0,78.0,1.341577,9.38949,1.085008,0.746522,group_totals
4,PIT,646.0,639.0,296.0,47.0,595.0,70.0,1.520124,8.289474,0.975232,1.159443,group_totals


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,17914.0,16559.0,7423.0,1047.0,18331.0,2193.0,1.397175,9.209501,1.101764,0.963325,weighted_by_IP (from totals)
1,SF,695.333333,669.0,232.0,35.0,700.0,75.0,1.346117,9.060403,0.970757,0.540748,group_totals
2,ATH,673.0,663.0,368.0,48.0,630.0,82.0,1.603269,8.424963,1.096582,1.566122,group_totals
3,TB,654.333333,549.0,232.0,46.0,657.0,76.0,1.263882,9.036679,1.045339,0.776363,group_totals
4,DET,650.333333,610.0,224.0,28.0,642.0,82.0,1.325474,8.884675,1.134803,0.827268,group_totals


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,17569.666667,15884.0,6794.0,952.0,17704.0,2037.0,1.344932,9.068812,1.043446,0.814529,weighted_by_IP (from totals)
1,DET,688.333333,578.0,210.0,33.0,624.0,76.0,1.192736,8.158838,0.993705,0.681356,group_totals
2,SF,648.666667,616.0,233.0,34.0,658.0,69.0,1.361254,9.129496,0.957348,0.5889,group_totals
3,MIA,646.0,589.0,243.0,38.0,655.0,58.0,1.346749,9.125387,0.80805,0.444272,group_totals
4,MIL,644.333333,532.0,222.0,24.0,638.0,72.0,1.20745,8.911536,1.005691,0.617693,group_totals


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,17656.666667,16322.0,7082.0,909.0,17398.0,2143.0,1.376987,8.868152,1.092335,0.964848,weighted_by_IP (from totals)
1,LAD,649.333333,608.0,270.0,38.0,687.0,81.0,1.410678,9.522074,1.12269,0.928645,group_totals
2,CWS,648.666667,610.0,295.0,32.0,645.0,79.0,1.444502,8.949126,1.096095,1.106886,group_totals
3,NYM,627.333333,591.0,233.0,36.0,621.0,66.0,1.370882,8.909139,0.946865,0.674283,group_totals
4,MIL,626.666667,543.0,244.0,29.0,620.0,61.0,1.302128,8.904255,0.876064,0.593617,group_totals


### Imputation

Now, we will fill in the early-season missing rolling pitching features using **prior-season summaries only**, consistent with our no-leakage approach. For each season dataset, the function identifies the rolling rate columns (e.g., 3-day and 7-day rolling WHIP, K/9, HR/9, and FIP) and imputes values only where they are structurally missing. 

In the **starter** case, missing rolling features are filled using that pitcher’s previous-season summary values when available; if the pitcher did not appear in the prior season or their value is unavailable, the function `impute_pitching_roll_rates_from_prev_season` falls back to the league-wide starter baseline (`ALL_STARTER`). In the **bullpen** case, missing values are filled using the team’s previous-season bullpen summary, with a fallback to the league-wide bullpen baseline (`ALL_BULLPEN`) if needed.


In [46]:
# Fill missing rolling pitching rates for each season from the PRIOR season's
# summary tables. The season frames are replaced in the notebook namespace
# with the value returned by the imputation helper.
for year in range(2022, 2026):
    prev_year = year - 1

    # Same call for both roles; starters are processed before the bullpen.
    for _role in ("starter", "bullpen"):
        _lines_key = f"{_role}_lines_{year}"
        globals()[_lines_key] = impute_pitching_roll_rates_from_prev_season(
            season_df=globals()[_lines_key],
            prev_summary_df=globals()[f"{_role}_summary_{prev_year}"],
            kind=_role,
        )

In [47]:
# Preview the first five rows of each season's starter lines, then each
# season's bullpen lines, with an HTML header above every table.
for _season, _starter_lines in {
    2022: starter_lines_2022,
    2023: starter_lines_2023,
    2024: starter_lines_2024,
    2025: starter_lines_2025,
}.items():
    display(HTML(f"<h4>Starting Pitcher Season {_season}</h4>"))
    display(_starter_lines.head(5))

# Blank spacer line between the starter and bullpen previews.
print(" ")

for _season, _bullpen_lines in {
    2022: bullpen_lines_2022,
    2023: bullpen_lines_2023,
    2024: bullpen_lines_2024,
    2025: bullpen_lines_2025,
}.items():
    display(HTML(f"<h4>Bullpen Pitcher Season {_season}</h4>"))
    display(_bullpen_lines.head(5))


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,did_not_end_pa,starter_full_game,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,662529,2022-08-02,WSH,WSH,NYM,1,starter,"Abbott, Cory",5.0,2.0,...,0,0,1.2,7.2,3.6,4.8,1.2,7.2,3.6,4.8
1,661558,2022-08-07,WSH,PHI,WSH,0,starter,"Abbott, Cory",3.333333,7.0,...,0,0,1.0,5.4,0.0,0.6,1.0,5.4,0.0,0.6
2,662551,2022-08-12,WSH,WSH,SD,1,starter,"Abbott, Cory",4.0,4.0,...,0,0,3.9,5.4,10.8,19.8,2.16,5.4,4.32,8.28
3,662470,2022-08-17,WSH,WSH,CHC,1,starter,"Abbott, Cory",6.0,3.0,...,0,0,1.75,11.25,0.0,-0.25,2.727273,8.590909,4.909091,8.863636
4,661924,2022-09-07,WSH,STL,WSH,0,starter,"Abbott, Cory",4.0,5.0,...,0,0,0.666667,7.5,1.5,1.0,1.1,9.0,0.9,0.5


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,did_not_end_pa,starter_full_game,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,717883,2023-06-05,CIN,CIN,MIL,1,starter,"Abbott, Andrew",6.0,1,...,0,0,1.317569,8.274553,1.174007,0.940789,1.317569,8.274553,1.174007,0.940789
1,717817,2023-06-10,CIN,STL,CIN,0,starter,"Abbott, Andrew",5.666667,5,...,0,0,0.833333,9.0,0.0,0.0,0.833333,9.0,0.0,0.0
2,717737,2023-06-16,CIN,HOU,CIN,0,starter,"Abbott, Andrew",6.0,4,...,0,0,1.411765,6.352941,0.0,0.176471,1.114286,7.714286,0.0,0.085714
3,717669,2023-06-21,CIN,CIN,COL,1,starter,"Abbott, Andrew",6.0,4,...,0,0,1.0,3.0,0.0,0.333333,1.2,4.628571,0.0,0.257143
4,717601,2023-06-27,CIN,BAL,CIN,0,starter,"Abbott, Andrew",6.0,2,...,0,0,0.666667,15.0,4.5,3.166667,0.833333,9.0,2.25,1.75


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,did_not_end_pa,starter_full_game,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,745600,2024-04-01,CIN,PHI,CIN,0,starter,"Abbott, Andrew",5.333333,3.0,...,0,0,1.359375,10.125,1.35,0.965625,1.359375,10.125,1.35,0.965625
1,746734,2024-04-07,CIN,CIN,NYM,1,starter,"Abbott, Andrew",5.0,7.0,...,0,0,0.9375,6.75,0.0,-0.375,0.9375,6.75,0.0,-0.375
2,746812,2024-04-12,CIN,CWS,CIN,0,starter,"Abbott, Andrew",7.0,4.0,...,0,0,2.0,7.2,1.8,2.8,1.451613,6.967742,0.870968,1.16129
3,745267,2024-04-17,CIN,SEA,CIN,0,starter,"Abbott, Andrew",6.0,4.0,...,0,0,0.571429,3.857143,0.0,-0.857143,1.166667,5.25,0.75,0.666667
4,746722,2024-04-23,CIN,CIN,PHI,1,starter,"Abbott, Andrew",4.333333,2.0,...,0,0,1.166667,9.0,3.0,3.833333,0.846154,6.230769,1.384615,1.307692


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,pitcher_name,IP,H,...,did_not_end_pa,starter_full_game,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,778357,2025-04-12,CIN,CIN,PIT,1,starter,"Abbott, Andrew",5.0,2,...,0,0,1.32439,7.507317,1.646341,1.895122,1.32439,7.507317,1.646341,1.895122
1,778276,2025-04-18,CIN,BAL,CIN,0,starter,"Abbott, Andrew",6.0,2,...,0,0,0.8,9.0,1.8,1.8,0.8,9.0,1.8,1.8
2,778181,2025-04-25,CIN,COL,CIN,0,starter,"Abbott, Andrew",4.0,5,...,0,0,0.5,16.5,1.5,-1.0,0.636364,13.090909,1.636364,0.272727
3,778107,2025-05-01,CIN,CIN,STL,1,starter,"Abbott, Andrew",4.0,3,...,0,0,2.5,9.0,2.25,5.0,2.5,9.0,2.25,5.0
4,778034,2025-05-06,CIN,ATL,CIN,0,starter,"Abbott, Andrew",5.0,4,...,0,0,1.75,6.75,0.0,1.5,2.125,7.875,1.125,3.25


 


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,did_not_end_pa,starter_full_game,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,661131,2022-04-08,ATH,PHI,ATH,0,bullpen,2.666667,5,4,...,0,0,1.3298,7.911471,1.228803,1.21197,1.3298,7.911471,1.228803,1.21197
1,661130,2022-04-09,ATH,PHI,ATH,0,bullpen,2.333333,0,3,...,0,0,3.375,6.75,0.0,3.0,3.375,6.75,0.0,3.0
2,661129,2022-04-10,ATH,PHI,ATH,0,bullpen,4.0,1,1,...,0,0,2.4,9.0,0.0,2.2,2.4,9.0,0.0,2.2
3,661915,2022-04-11,ATH,TB,ATH,0,bullpen,4.0,5,0,...,0,0,1.555556,10.0,1.0,1.888889,1.555556,10.0,1.0,1.888889
4,661944,2022-04-12,ATH,TB,ATH,0,bullpen,8.0,8,5,...,0,0,0.967742,12.193548,0.870968,-0.290323,1.461538,11.076923,0.692308,0.384615


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,did_not_end_pa,starter_full_game,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,718769,2023-03-30,ATH,ATH,LAA,1,bullpen,4.0,1,3,...,0,0,1.439438,8.547104,1.042715,1.049737,1.439438,8.547104,1.042715,1.049737
1,718757,2023-04-01,ATH,ATH,LAA,1,bullpen,6.666667,6,3,...,0,0,1.0,11.25,0.0,-0.25,1.0,11.25,0.0,-0.25
2,718734,2023-04-02,ATH,ATH,LAA,1,bullpen,3.333333,2,1,...,0,0,1.21875,6.75,0.84375,1.40625,1.21875,6.75,0.84375,1.40625
3,718721,2023-04-03,ATH,ATH,CLE,1,bullpen,5.0,9,2,...,0,0,1.2,4.5,0.9,1.5,1.142857,6.428571,0.642857,1.0
4,718707,2023-04-04,ATH,ATH,CLE,1,bullpen,4.333333,0,2,...,0,0,1.6,4.8,0.6,1.2,1.473684,6.157895,0.473684,0.894737


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,did_not_end_pa,starter_full_game,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,745687,2024-03-28,ATH,ATH,CLE,1,bullpen,5.333333,4,2,...,0,0,1.603269,8.424963,1.096582,1.566122,1.603269,8.424963,1.096582,1.566122
1,745682,2024-03-29,ATH,ATH,CLE,1,bullpen,3.666667,2,4,...,0,0,1.5,6.75,0.0,0.75,1.5,6.75,0.0,0.75
2,745684,2024-03-30,ATH,ATH,CLE,1,bullpen,5.333333,9,3,...,0,0,1.555556,7.0,0.0,1.111111,1.555556,7.0,0.0,1.111111
3,745683,2024-03-31,ATH,ATH,CLE,1,bullpen,1.666667,4,1,...,0,0,1.883721,7.534884,0.627907,1.744186,1.883721,7.534884,0.627907,1.744186
4,745675,2024-04-01,ATH,ATH,BOS,1,bullpen,6.333333,1,3,...,0,0,2.34375,8.4375,0.84375,2.15625,2.0625,7.875,0.5625,1.6875


Unnamed: 0,game_id,game_date,pitching_team,home_team,away_team,is_home_team,pitcher_role,IP,H,BB,...,did_not_end_pa,starter_full_game,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,778547,2025-03-27,ATH,SEA,ATH,0,bullpen,1.666667,2,3,...,0,0,1.383833,9.234031,0.793669,0.656303,1.383833,9.234031,0.793669,0.656303
1,778541,2025-03-28,ATH,SEA,ATH,0,bullpen,3.0,2,2,...,0,0,3.0,5.4,10.8,19.8,3.0,5.4,10.8,19.8
2,778521,2025-03-29,ATH,SEA,ATH,0,bullpen,4.0,2,2,...,0,0,1.928571,9.642857,3.857143,6.642857,1.928571,9.642857,3.857143,6.642857
3,778513,2025-03-30,ATH,SEA,ATH,0,bullpen,1.0,0,0,...,0,0,1.5,13.5,2.076923,2.423077,1.5,13.5,2.076923,2.423077
4,778501,2025-03-31,ATH,ATH,CHC,1,bullpen,5.0,12,6,...,0,0,1.0,15.75,0.0,-2.0,1.344828,13.965517,1.862069,1.758621


### Quality Check 

Below, we summarize missingness in the imputed starter and bullpen datasets for each season (2022–2025). This check confirms that the imputation step removed structural NaNs from early-season rolling features before we merge starter and bullpen data.


In [48]:
years = range(2022, 2026)

# Print the missing-value summary of the starter and bullpen frames, season by season.
for year in years:
    st, bp = (globals()[f"{role}_lines_{year}"] for role in ("starter", "bullpen"))

    print(f"\n==== {year} ====")
    for _label, _frame in (("starter", st), ("bullpen", bp)):
        print(f"{_label}:", missing_summary(_frame))



==== 2022 ====
starter: {'rows': 4860, 'total_na_cells': 0, 'rows_with_any_na': 0, 'cols_with_any_na': 0}
bullpen: {'rows': 4824, 'total_na_cells': 0, 'rows_with_any_na': 0, 'cols_with_any_na': 0}

==== 2023 ====
starter: {'rows': 4860, 'total_na_cells': 0, 'rows_with_any_na': 0, 'cols_with_any_na': 0}
bullpen: {'rows': 4825, 'total_na_cells': 0, 'rows_with_any_na': 0, 'cols_with_any_na': 0}

==== 2024 ====
starter: {'rows': 4858, 'total_na_cells': 0, 'rows_with_any_na': 0, 'cols_with_any_na': 0}
bullpen: {'rows': 4830, 'total_na_cells': 0, 'rows_with_any_na': 0, 'cols_with_any_na': 0}

==== 2025 ====
starter: {'rows': 4860, 'total_na_cells': 0, 'rows_with_any_na': 0, 'cols_with_any_na': 0}
bullpen: {'rows': 4831, 'total_na_cells': 0, 'rows_with_any_na': 0, 'cols_with_any_na': 0}


### Combining Pitching Features at the Game Level

After computing rolling pitching metrics for **starting pitchers** (pitcher-level) and the **bullpen** (team-level), we can finally combine these features into a single **game-level** dataset keyed by `game_id`. This produces one row per game containing the rolling 3-day and 7-day metrics for both the home and away teams.

In addition, we retain the **home and away starting pitcher names** and append `_home` and `_away` suffixes to each feature to clearly indicate which side the metric corresponds to. This consolidated table is then used to compute home–away differences and to merge pitching features with our batting features.


In [49]:
# Rolling windows and rate metrics passed to the combiner for every season.
_rolling_spec = {
    "windows": ("3D", "7D"),
    "metrics": ("WHIP", "K9", "HR9", "FIP"),
}

# Build one game-level table per season from the starter and bullpen lines.
# NOTE(review): this range includes 2021, while the imputation loop above only
# covers 2022-2025 — confirm the 2021 table is intentionally built un-imputed.
for year in range(2021, 2026):
    globals()[f"game_pitching_rates_{year}"] = combine_game_level_pitching_rolling_rates(
        starter_df=globals()[f"starter_lines_{year}"],
        bullpen_df=globals()[f"bullpen_lines_{year}"],
        **_rolling_spec,
    )

In [50]:
# Preview the first five rows of each season's combined game-level pitching table.
for _season, _rates in {
    2022: game_pitching_rates_2022,
    2023: game_pitching_rates_2023,
    2024: game_pitching_rates_2024,
    2025: game_pitching_rates_2025,
}.items():
    display(HTML(f"<h4>Pitcher Season {_season}</h4>"))
    display(_rates.head(5))


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_full_game_home,did_not_end_pa_home_x,roll_3D_starter_WHIP_home,roll_3D_starter_K9_home,roll_3D_starter_HR9_home,...,roll_7D_bullpen_FIP_home,did_not_end_pa_away_y,roll_3D_bullpen_WHIP_away,roll_3D_bullpen_K9_away,roll_3D_bullpen_HR9_away,roll_3D_bullpen_FIP_away,roll_7D_bullpen_WHIP_away,roll_7D_bullpen_K9_away,roll_7D_bullpen_HR9_away,roll_7D_bullpen_FIP_away
0,661042,2022-04-07,LAA,HOU,"Ohtani, Shohei",0,0,1.169231,10.8,1.038462,...,1.030761,0.0,1.376932,9.96849,1.187872,1.041617,1.376932,9.96849,1.187872,1.041617
1,661577,2022-04-07,ATL,CIN,"Fried, Max",0,0,1.14959,8.741803,0.829918,...,0.923619,0.0,1.445954,10.589486,1.467218,1.408742,1.445954,10.589486,1.467218,1.408742
2,662021,2022-04-07,STL,PIT,"Wainwright, Adam",0,0,1.109121,7.651466,0.923453,...,0.875938,0.0,1.453882,9.01876,1.181866,1.32569,1.453882,9.01876,1.181866,1.32569
3,662571,2022-04-07,WSH,NYM,"Corbin, Patrick",0,0,1.502947,7.585462,1.962672,...,1.709447,0.0,1.386712,10.088586,1.165247,0.897785,1.386712,10.088586,1.165247,0.897785
4,662766,2022-04-07,KC,CLE,"Greinke, Zack",0,0,1.192843,6.280318,1.55666,...,1.039717,0.0,1.347079,9.927835,1.190722,0.900344,1.347079,9.927835,1.190722,0.900344


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_full_game_home,did_not_end_pa_home_x,roll_3D_starter_WHIP_home,roll_3D_starter_K9_home,roll_3D_starter_HR9_home,...,roll_7D_bullpen_FIP_home,did_not_end_pa_away_y,roll_3D_bullpen_WHIP_away,roll_3D_bullpen_K9_away,roll_3D_bullpen_HR9_away,roll_3D_bullpen_FIP_away,roll_7D_bullpen_WHIP_away,roll_7D_bullpen_K9_away,roll_7D_bullpen_HR9_away,roll_7D_bullpen_FIP_away
0,718767,2023-03-30,SEA,CLE,"Castillo, Luis",0,0,1.147651,10.087248,0.785235,...,0.540892,0.0,1.165851,9.78213,0.875765,0.280906,1.165851,9.78213,0.875765,0.280906
1,718768,2023-03-30,HOU,CWS,"Valdez, Framber",0,0,1.222037,8.744574,0.495826,...,-0.063094,0.0,1.357474,9.667439,0.9073,0.491889,1.357474,9.667439,0.9073,0.491889
2,718769,2023-03-30,ATH,LAA,"Muller, Kyle",0,0,1.615385,8.307692,1.384615,...,1.049737,0.0,1.284742,8.469087,1.240499,1.206466,1.284742,8.469087,1.240499,1.206466
3,718770,2023-03-30,LAD,AZ,"Urías, Julio",0,0,0.996161,8.602687,1.191939,...,0.131261,0.0,1.453888,7.779385,1.155515,1.291139,1.453888,7.779385,1.155515,1.291139
4,718772,2023-03-30,STL,TOR,"Mikolas, Miles",0,0,1.076142,6.989848,1.142132,...,0.814008,0.0,1.293956,9.01978,1.171978,0.924725,1.293956,9.01978,1.171978,0.924725


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_full_game_home,did_not_end_pa_home_x,roll_3D_starter_WHIP_home,roll_3D_starter_K9_home,roll_3D_starter_HR9_home,...,roll_7D_bullpen_FIP_home,did_not_end_pa_away_y,roll_3D_bullpen_WHIP_away,roll_3D_bullpen_K9_away,roll_3D_bullpen_HR9_away,roll_3D_bullpen_FIP_away,roll_7D_bullpen_WHIP_away,roll_7D_bullpen_K9_away,roll_7D_bullpen_HR9_away,roll_7D_bullpen_FIP_away
0,745444,2024-03-20,SD,LAD,"Darvish, Yu",0,0,1.369727,9.44665,1.205955,...,0.97373,0.0,1.235662,9.347237,0.957247,0.478624,1.235662,9.347237,0.957247,0.478624
1,746175,2024-03-21,LAD,SD,"Yamamoto, Yoshinobu",0,0,1.367453,8.576037,1.340462,...,-1.5,0.0,2.25,6.75,0.0,2.4375,2.25,6.75,0.0,2.4375
2,745039,2024-03-28,TEX,CHC,"Eovaldi, Nathan",0,0,1.197183,8.366197,0.950704,...,1.199286,0.0,1.364583,10.015625,1.0,0.807292,1.364583,10.015625,1.0,0.807292
3,745116,2024-03-28,TB,TOR,"Eflin, Zach",0,0,1.061185,9.602294,0.98088,...,0.776363,0.0,1.275135,9.850692,1.154124,0.659241,1.275135,9.850692,1.154124,0.659241
4,745283,2024-03-28,SEA,BOS,"Castillo, Luis",0,0,1.133333,10.107692,1.292308,...,0.546626,0.0,1.47298,8.83788,1.083891,0.968091,1.47298,8.83788,1.083891,0.968091


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_full_game_home,did_not_end_pa_home_x,roll_3D_starter_WHIP_home,roll_3D_starter_K9_home,roll_3D_starter_HR9_home,...,roll_7D_bullpen_FIP_home,did_not_end_pa_away_y,roll_3D_bullpen_WHIP_away,roll_3D_bullpen_K9_away,roll_3D_bullpen_HR9_away,roll_3D_bullpen_FIP_away,roll_7D_bullpen_WHIP_away,roll_7D_bullpen_K9_away,roll_7D_bullpen_HR9_away,roll_7D_bullpen_FIP_away
0,778563,2025-03-18,CHC,LAD,"Imanaga, Shota",0,0,1.056751,9.193738,1.426614,...,0.785925,0.0,1.259937,8.839958,1.101464,0.902197,1.259937,8.839958,1.101464,0.902197
1,778564,2025-03-19,CHC,LAD,"Steele, Justin",0,0,1.134328,9.067164,0.80597,...,-0.4,0.0,0.25,11.25,0.0,-1.75,0.25,11.25,0.0,-1.75
2,778545,2025-03-27,SD,ATL,"King, Michael",0,0,1.232143,10.5,0.857143,...,0.434604,0.0,1.234853,10.017489,0.961274,0.247345,1.234853,10.017489,0.961274,0.247345
3,778546,2025-03-27,LAD,DET,"Snell, Blake",0,0,1.067961,12.669903,0.524272,...,-1.9,0.0,1.192736,8.158838,0.993705,0.681356,1.192736,8.158838,0.993705,0.681356
4,778547,2025-03-27,SEA,ATH,"Gilbert, Logan",0,0,0.910112,9.53451,1.126806,...,0.749484,0.0,1.383833,9.234031,0.793669,0.656303,1.383833,9.234031,0.793669,0.656303


### Quality Checks

Below, I perform a series of quality checks to validate the integrity of the merged game-level pitching dataset.

First, I confirm that there are **no missing rolling rate values for starting pitchers** across all seasons. Since starting pitcher rolling features were fully imputed prior to merging, the combined dataset should contain complete starter information for every game.

Second, I verify that any missing bullpen rolling values occur **only in games where a starter pitched a complete game**. In these cases, the bullpen did not appear, so missing bullpen features are structurally expected and not indicative of data issues.

Together, these checks ensure that:
- Starter rolling metrics are fully populated.
- Bullpen missingness is entirely explained by complete-game starts.
- No unintended data loss or merge inconsistencies occurred during dataset construction.

In [51]:
# For each season, report games whose starter rolling columns contain any NaN.
for year in range(2022, 2026):
    df = globals()[f"game_pitching_rates_{year}"]

    # Starter rolling columns are named roll_<window>_starter_<metric>_<side>.
    starter_roll_cols = [c for c in df.columns if c.startswith("roll_") and "_starter_" in c]
    bad = df.loc[df[starter_roll_cols].isna().any(axis=1)].copy()

    print(f"\n===== {year} GAME_PITCHING_RATES (starter cols) =====")
    print(f"Rows with any NA in starter roll cols: {len(bad)}")

    # Details are only printed when at least one offending row exists.
    if len(bad) > 0:
        na_cols = (
            bad[starter_roll_cols]
            .isna()
            .sum()
            .loc[lambda counts: counts > 0]
            .sort_values(ascending=False)
        )
        print("\nMissing counts by starter roll column:")
        print(na_cols)

        # Identifier columns (when present) followed by the columns that have gaps.
        show_cols = [
            *(
                c
                for c in (
                    "game_id",
                    "game_date",
                    "home_team",
                    "away_team",
                    "starter_pitcher_name_home",
                    "starter_pitcher_name_away",
                )
                if c in bad.columns
            ),
            *na_cols.index,
        ]

        print("\nExample rows:")
        print(bad[show_cols].head(25))


===== 2022 GAME_PITCHING_RATES (starter cols) =====
Rows with any NA in starter roll cols: 0

===== 2023 GAME_PITCHING_RATES (starter cols) =====
Rows with any NA in starter roll cols: 0

===== 2024 GAME_PITCHING_RATES (starter cols) =====
Rows with any NA in starter roll cols: 0

===== 2025 GAME_PITCHING_RATES (starter cols) =====
Rows with any NA in starter roll cols: 0


In [52]:
years = range(2022, 2026)

# For each season, cross-tabulate "any bullpen rolling value missing" against
# "either starter threw a complete game" to confirm the two sets coincide.
for year in years:
    df = globals()[f"game_pitching_rates_{year}"]

    bullpen_roll = [c for c in df.columns if c.startswith("roll_") and "_bullpen_" in c]
    if not bullpen_roll:
        print(f"\n==== {year} ====\nNo bullpen rolling columns found.")
        continue

    bullpen_missing = df[bullpen_roll].isna().any(axis=1)

    # full game if either starter threw a complete game.
    # Start from an all-False Series aligned to df so that an absent flag
    # column simply counts as "no complete game". The earlier
    # `df.get(col, 0)` fallback handed a scalar to pd.to_numeric, which
    # returns a scalar without `.fillna`, so a missing column raised
    # AttributeError instead of being treated as 0.
    full_game = pd.Series(False, index=df.index)
    for _flag_col in ("starter_full_game_home", "starter_full_game_away"):
        if _flag_col in df.columns:
            full_game = full_game | (
                pd.to_numeric(df[_flag_col], errors="coerce").fillna(0).astype(int).eq(1)
            )

    print(f"\n==== {year} ====")
    print(f"Rows with any NA in bullpen roll cols: {int(bullpen_missing.sum())}")
    print(f"Rows where starter_full_game_home OR starter_full_game_away == 1: {int(full_game.sum())}")
    print(f"Overlap (bullpen missing AND full game): {int((bullpen_missing & full_game).sum())}")
    print(f"Bullpen missing BUT NOT full game: {int((bullpen_missing & ~full_game).sum())}")
    print(f"Full game BUT bullpen NOT missing: {int((~bullpen_missing & full_game).sum())}")


==== 2022 ====
Rows with any NA in bullpen roll cols: 34
Rows where starter_full_game_home OR starter_full_game_away == 1: 34
Overlap (bullpen missing AND full game): 34
Bullpen missing BUT NOT full game: 0
Full game BUT bullpen NOT missing: 0

==== 2023 ====
Rows with any NA in bullpen roll cols: 34
Rows where starter_full_game_home OR starter_full_game_away == 1: 34
Overlap (bullpen missing AND full game): 34
Bullpen missing BUT NOT full game: 0
Full game BUT bullpen NOT missing: 0

==== 2024 ====
Rows with any NA in bullpen roll cols: 27
Rows where starter_full_game_home OR starter_full_game_away == 1: 27
Overlap (bullpen missing AND full game): 27
Bullpen missing BUT NOT full game: 0
Full game BUT bullpen NOT missing: 0

==== 2025 ====
Rows with any NA in bullpen roll cols: 28
Rows where starter_full_game_home OR starter_full_game_away == 1: 28
Overlap (bullpen missing AND full game): 28
Bullpen missing BUT NOT full game: 0
Full game BUT bullpen NOT missing: 0


#### Analysis

No starting pitcher rolling data is missing across any season, confirming that the imputation and merging steps preserved complete starter information.

All missing bullpen rolling values are fully explained by games in which a starting pitcher threw a complete game. In these cases, the bullpen did not appear, so the absence of bullpen metrics is structurally expected rather than indicative of data loss.

With these validations in place, we can proceed to carry the bullpen data forward in a controlled and methodologically consistent manner.

### Filling in Rolling Bullpens: Carry Forward

When a starting pitcher throws a complete game, the bullpen does not appear, so the bullpen rolling-rate features (e.g., 3-day and 7-day WHIP/K9/HR9/FIP) show up as missing even though nothing is “wrong” with the data. To avoid leaving structural missingness in these cases, I **carry forward** the bullpen rolling metrics by using the team’s most recent previously-available bullpen rolling values.


In [53]:
# Carry bullpen rolling values forward for each season's game-level table and
# report how many rows had bullpen gaps before and after.
for year in range(2022, 2026):
    name = f"game_pitching_rates_{year}"
    df = globals().get(name)

    if df is not None:
        bullpen_cols = [c for c in df.columns if "_bullpen_" in c]

        # Count the gaps before calling the helper so the report reflects the input state.
        before_na_rows = int(df[bullpen_cols].isna().any(axis=1).sum())
        fixed = carry_forward_bullpen_rolls_on_full_games(df)
        after_na_rows = int(fixed[bullpen_cols].isna().any(axis=1).sum())

        # Replace the season table in the notebook namespace with the filled version.
        globals()[name] = fixed

        print(f"{year}: bullpen NA rows before={before_na_rows} | after={after_na_rows}")
    else:
        print(f"{name}: not found")

2022: bullpen NA rows before=34 | after=0
2023: bullpen NA rows before=34 | after=0
2024: bullpen NA rows before=27 | after=0
2025: bullpen NA rows before=28 | after=0


### Pitching Deltas (Home − Away)

After constructing game-level pitching features for both teams, we compute **home–away differences** for each rolling metric. These deltas summarize the relative pitching advantage in a single value per game (e.g., higher starter K/9 for the home team versus the away team).

The resulting dataset retains only the game identifiers and starting pitcher names, along with Δ features for rolling **starter** metrics (FIP, WHIP, K/9, HR/9) and rolling **bullpen FIP** for each window (3-day and 7-day). This format is convenient for downstream modeling and for merging with game-level batting features.


In [54]:
# Build the pitching delta table for each season from its game-level rates
# table and store it as game_pitching_deltas_<year>.
_notebook_vars = globals()
for year in range(2022, 2026):
    _notebook_vars[f"game_pitching_deltas_{year}"] = make_pitching_delta_df(
        _notebook_vars[f"game_pitching_rates_{year}"]
    )


In [55]:
# Preview the first five rows of each season's pitching delta table.
for _season, _deltas in {
    2022: game_pitching_deltas_2022,
    2023: game_pitching_deltas_2023,
    2024: game_pitching_deltas_2024,
    2025: game_pitching_deltas_2025,
}.items():
    display(HTML(f"<h4> Pitcher Season {_season}</h4>"))
    display(_deltas.head(5))


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D
0,661042,2022-04-07,LAA,HOU,"Ohtani, Shohei","Valdez, Framber",-0.501346,-0.173269,2.3625,0.228462,-0.010856,-0.501346,-0.173269,2.3625,0.228462,-0.010856
1,661577,2022-04-07,ATL,CIN,"Fried, Max","Mahle, Tyler",-0.500258,-0.156226,-1.896095,-0.385842,-0.485123,-0.500258,-0.156226,-1.896095,-0.385842,-0.485123
2,662021,2022-04-07,STL,PIT,"Wainwright, Adam","Brubaker, JT",-1.496179,-0.272351,-1.838997,-1.136493,-0.449753,-1.496179,-0.272351,-1.838997,-1.136493,-0.449753
3,662571,2022-04-07,WSH,NYM,"Corbin, Patrick","Megill, Tylor",0.740873,0.193245,-2.388419,0.048493,0.811662,0.740873,0.193245,-2.388419,0.048493,0.811662
4,662766,2022-04-07,KC,CLE,"Greinke, Zack","Bieber, Shane",1.678132,-0.063212,-6.238713,0.528978,0.139373,1.678132,-0.063212,-6.238713,0.528978,0.139373


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D
0,718767,2023-03-30,SEA,CLE,"Castillo, Luis","Bieber, Shane",0.202156,0.087045,1.087248,-0.032947,0.259986,0.202156,0.087045,1.087248,-0.032947,0.259986
1,718768,2023-03-30,HOU,CWS,"Valdez, Framber","Cease, Dylan",-0.038578,0.082587,-2.501297,-0.296834,-0.554982,-0.038578,0.082587,-2.501297,-0.296834,-0.554982
2,718769,2023-03-30,ATH,LAA,"Muller, Kyle","Ohtani, Shohei",2.71371,0.587159,-3.613679,0.622519,-0.15673,2.71371,0.587159,-3.613679,0.622519,-0.15673
3,718770,2023-03-30,LAD,AZ,"Urías, Julio","Gallen, Zac",0.670662,0.005336,-0.909239,0.448819,-1.159878,0.670662,0.005336,-0.909239,0.448819,-1.159878
4,718772,2023-03-30,STL,TOR,"Mikolas, Miles","Manoah, Alek",0.545019,-0.004475,-1.346344,0.401137,-0.110717,0.545019,-0.004475,-1.346344,0.401137,-0.110717


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D
0,745444,2024-03-20,SD,LAD,"Darvish, Yu","Glasnow, Tyler",1.100972,0.27806,-2.70335,0.230955,0.495107,1.100972,0.27806,-2.70335,0.230955,0.495107
1,746175,2024-03-21,LAD,SD,"Yamamoto, Yoshinobu","Musgrove, Joe",0.910845,0.121779,-0.486247,0.406206,-3.9375,0.910845,0.121779,-0.486247,0.406206,-3.9375
2,745039,2024-03-28,TEX,CHC,"Eovaldi, Nathan","Steele, Justin",0.871255,-0.019033,-0.807548,0.220974,0.391994,0.871255,-0.019033,-0.807548,0.220974,0.391994
3,745116,2024-03-28,TB,TOR,"Eflin, Zach","Berríos, José",-1.009147,-0.203679,0.650943,-0.235337,0.117121,-1.009147,-0.203679,0.650943,-0.235337,0.117121
4,745283,2024-03-28,SEA,BOS,"Castillo, Luis","Bello, Brayan",-0.743401,-0.266235,2.410068,-0.10726,-0.421465,-0.743401,-0.266235,2.410068,-0.10726,-0.421465


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D
0,778563,2025-03-18,CHC,LAD,"Imanaga, Shota","Yamamoto, Yoshinobu",1.123302,-0.073846,-1.38462,0.721391,-0.116271,1.123302,-0.073846,-1.38462,0.721391,-0.116271
1,778564,2025-03-19,CHC,LAD,"Steele, Justin","Sasaki, Roki",-0.933196,-0.189999,0.617142,-0.422758,1.35,-0.933196,-0.189999,0.617142,-0.422758,1.35
2,778545,2025-03-27,SD,ATL,"King, Michael","Sale, Chris",1.198333,0.157857,-1.071429,0.394286,0.187259,1.198333,0.157857,-1.071429,0.394286,0.187259
3,778546,2025-03-27,LAD,DET,"Snell, Blake","Skubal, Tarik",-0.070999,0.092437,1.907665,-0.18377,-2.581356,-0.070999,0.092437,1.907665,-0.18377,-2.581356
4,778547,2025-03-27,SEA,ATH,"Gilbert, Logan","Severino, Luis",-0.960329,-0.41391,1.439538,-0.029619,0.093181,-0.960329,-0.41391,1.439538,-0.029619,0.093181


### Checking for Missing Values

Below, we perform a final check to confirm that there are no missing values remaining in the game-level pitching delta datasets for each season.

In [56]:
# Final gate: every season's delta table must be completely free of NaNs.
_deltas_by_year = {
    2022: game_pitching_deltas_2022,
    2023: game_pitching_deltas_2023,
    2024: game_pitching_deltas_2024,
    2025: game_pitching_deltas_2025,
}

for year, df in _deltas_by_year.items():
    # Count NaN cells across the whole frame in one pass.
    total_na = int(df.isna().to_numpy().sum())
    print(f"\n{year}\nTotal missing cells: {total_na}")
    assert total_na == 0, f"{year}: found {total_na} missing cells in game_pitching_deltas_{year}"


2022
Total missing cells: 0

2023
Total missing cells: 0

2024
Total missing cells: 0

2025
Total missing cells: 0


### Checking Number of Games

**TODO**: Make a function that works with all the other dataframes and, rather than returning the dataframe, returns True or False so the result can be checked with an assert statement.

In [57]:
years = range(2022, 2026)

# Count the distinct games each team appears in (home or away) per season.
_team_game_counts = {}
for _season in years:
    _deltas = globals()[f"game_pitching_deltas_{_season}"]

    # Stack the home and away views into one long (game_id, team) table.
    _long = pd.concat(
        [
            _deltas[["game_id", _side]].rename(columns={_side: "team"})
            for _side in ("home_team", "away_team")
        ],
        ignore_index=True,
    )

    _team_game_counts[_season] = (
        _long.drop_duplicates().groupby("team")["game_id"].nunique()
    )

# One column per season, one row per team, sorted by team.
games_per_team_pitching = pd.concat(_team_game_counts, axis=1).sort_index()

games_per_team_pitching


Unnamed: 0_level_0,2022,2023,2024,2025
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ATH,162,162,162,162
ATL,162,162,162,162
AZ,162,162,162,162
BAL,162,162,162,162
BOS,162,162,162,162
CHC,162,162,162,162
CIN,162,162,162,162
CLE,162,162,161,162
COL,162,162,162,162
CWS,162,162,162,162


## Batter Metrics

Next, we create the batting features needed for modeling. Where the pitching features above were built around the **starting pitcher** and the bullpen, the batting features are computed at the team level.


We calculate the following batter metrics for each team:

- On Base Percentage (OBP)

- Isolated Power (ISO)


$\text{OBP} = \frac{\text{H} + \text{BB} + \text{HBP}}
{\text{AB} + \text{BB} + \text{HBP} + \text{SF}}$

$\text{ISO} = \frac{2\text{B} + (2*3\text{B}) + (3*\text{HR})}{\text{AB}}
$


### Batting Indicators

Like pitchers, we create batting indicators that serve as building blocks for our batting feature calculations.


In [58]:
# Derive pa_batter_<year> from pa_<year> by adding batting indicators to a
# copy, leaving the source frame untouched. Missing seasons are reported and skipped.
for y in range(2021, 2026):
    src_name, dst_name = f"pa_{y}", f"pa_batter_{y}"

    df = globals().get(src_name)
    if df is not None:
        globals()[dst_name] = add_batting_indicators(df.copy())
        print(f"{dst_name}: indicators added (from {src_name})")
    else:
        print(f"{src_name}: (not found)")


pa_batter_2021: indicators added (from pa_2021)
pa_batter_2022: indicators added (from pa_2022)
pa_batter_2023: indicators added (from pa_2023)
pa_batter_2024: indicators added (from pa_2024)
pa_batter_2025: indicators added (from pa_2025)


In [59]:
# Preview the first five rows of each season's plate-appearance frame with batting indicators.
for _season, _pa_batter in {
    2021: pa_batter_2021,
    2022: pa_batter_2022,
    2023: pa_batter_2023,
    2024: pa_batter_2024,
    2025: pa_batter_2025,
}.items():
    display(HTML(f"<h4>Season {_season}</h4>"))
    display(_pa_batter.head(5))


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,starter_full_game,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
3,632169,2021-04-10,SF,COL,SF,1,Bot,2,0,0,...,0,0,0,0,0,0,0,0,0,0
14,632169,2021-04-10,SF,COL,SF,1,Bot,7,0,0,...,0,1,0,0,0,0,0,0,0,0
15,632169,2021-04-10,SF,COL,SF,1,Bot,7,0,0,...,0,0,0,0,0,0,1,0,0,0
19,632169,2021-04-10,SF,COL,SF,1,Bot,4,1,0,...,0,0,0,0,0,0,0,0,0,0
27,632169,2021-04-10,SF,COL,SF,1,Bot,8,2,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,starter_full_game,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
0,661032,2022-04-26,LAA,CLE,LAA,1,Bot,5,0,0,...,0,0,0,0,0,0,0,0,0,0
1,661032,2022-04-26,LAA,CLE,LAA,1,Bot,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,661032,2022-04-26,LAA,CLE,LAA,1,Bot,5,2,0,...,0,0,0,0,0,0,0,0,0,0
3,661032,2022-04-26,LAA,CLE,LAA,2,Bot,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,661032,2022-04-26,LAA,CLE,LAA,2,Bot,1,1,0,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,starter_full_game,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
0,716352,2023-10-01,KC,NYY,NYY,1,Top,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,716352,2023-10-01,KC,NYY,NYY,1,Top,2,0,0,...,0,0,0,0,0,0,1,0,0,0
7,716352,2023-10-01,KC,NYY,NYY,1,Top,3,0,0,...,0,0,0,0,0,0,1,0,0,0
15,716352,2023-10-01,KC,NYY,NYY,1,Top,10,0,0,...,0,0,0,0,0,0,1,0,0,0
18,716352,2023-10-01,KC,NYY,NYY,1,Top,3,2,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,starter_full_game,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
0,744795,2024-09-25,WSH,KC,WSH,1,Bot,4,0,0,...,0,0,0,0,0,0,0,0,0,0
1,744795,2024-09-25,WSH,KC,WSH,1,Bot,6,1,0,...,0,0,0,0,0,0,0,0,0,0
2,744795,2024-09-25,WSH,KC,WSH,1,Bot,3,2,0,...,0,0,0,0,0,0,0,0,0,0
3,744795,2024-09-25,WSH,KC,WSH,2,Bot,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,744795,2024-09-25,WSH,KC,WSH,2,Bot,4,0,0,...,0,1,0,0,0,0,0,0,0,0


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,starter_full_game,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
4,776135,2025-09-28,LAA,HOU,LAA,1,Bot,5,0,0,...,0,0,0,0,0,0,0,0,0,0
11,776135,2025-09-28,LAA,HOU,LAA,1,Bot,4,1,1,...,0,0,0,0,0,0,0,0,0,0
14,776135,2025-09-28,LAA,HOU,LAA,1,Bot,6,1,0,...,0,0,0,0,0,0,0,0,0,1
18,776135,2025-09-28,LAA,HOU,LAA,1,Bot,4,2,1,...,0,0,0,0,0,0,0,0,0,0
22,776135,2025-09-28,LAA,HOU,LAA,2,Bot,4,0,1,...,0,0,0,0,0,0,0,0,0,0


### Aggregate plate appearances to team-game batting totals

This function takes a plate-appearance–level batting DataFrame and aggregates it to one row per `(game_id, game_date, batting_team)`. It first derives game-level counting stats—hits (`H`), total bases (`TB`), and at-bats (`AB`)—from the PA indicators. In particular, `AB` excludes non–at-bat events (walks, hit-by-pitch, sacrifice flies, sacrifice bunts, catcher’s interference). It then groups by game and team to produce team-game totals such as `PA`, `AB`, `H`, `TB`, `BB`, `HBP`, `SF`, `SH`, `CI`, `HR`, and hit-type counts (`_1B`, `_2B`, `_3B`). These totals are intended as building blocks for downstream rate stats like OBP and ISO.


In [60]:
# Aggregate each season's pa_batter_<year> frame with
# aggregate_team_game_batting and store it as team_game_batting_<year>.
for y in range(2021, 2026):
    pa_name, out_name = f"pa_batter_{y}", f"team_game_batting_{y}"
    pa_df = globals().get(pa_name)

    if pa_df is not None:
        team_game_df = aggregate_team_game_batting(pa_df)
        globals()[out_name] = team_game_df
        print(f"{y}: created {out_name} ({len(team_game_df):,} rows)")
    else:
        # The season's plate-appearance frame is not defined: report and skip.
        print(f"{y}: missing {pa_name}")


2021: created team_game_batting_2021 (4,858 rows)
2022: created team_game_batting_2022 (4,860 rows)
2023: created team_game_batting_2023 (4,860 rows)
2024: created team_game_batting_2024 (4,858 rows)
2025: created team_game_batting_2025 (4,860 rows)


In [61]:
# Preview the first five team-game batting rows for each season.
# Each table holds both home and away batting rows (see `is_home_batting`),
# so the heading names only the season instead of "Home Season".
for y, season_df in (
    (2021, team_game_batting_2021),
    (2022, team_game_batting_2022),
    (2023, team_game_batting_2023),
    (2024, team_game_batting_2024),
    (2025, team_game_batting_2025),
):
    display(HTML(f"<h4>Season {y}</h4>"))
    display(season_df.head(5))


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,is_home_batting
0,634640,2021-04-01,ATH,ATH,HOU,35,30,6,8,3,1,1,0,0,0,4,2,0,True
1,634605,2021-04-02,ATH,ATH,HOU,39,33,8,14,3,2,1,0,0,1,5,1,1,True
2,634629,2021-04-03,ATH,ATH,HOU,35,29,3,3,5,1,0,0,0,0,3,0,0,True
3,634651,2021-04-04,ATH,ATH,HOU,33,31,4,8,2,0,0,0,0,0,1,2,1,True
4,634600,2021-04-05,ATH,ATH,LAD,38,34,6,9,4,0,0,0,0,1,5,0,0,True


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,is_home_batting
0,661131,2022-04-08,ATH,PHI,ATH,35,34,6,13,0,1,0,0,0,2,3,1,0,False
1,661130,2022-04-09,ATH,PHI,ATH,31,30,5,6,1,0,0,0,0,0,4,1,0,False
2,661129,2022-04-10,ATH,PHI,ATH,39,35,9,14,3,1,0,0,0,1,6,2,0,False
3,661915,2022-04-11,ATH,TB,ATH,45,41,13,29,3,1,0,0,0,4,6,2,1,False
4,661944,2022-04-12,ATH,TB,ATH,45,35,8,12,9,1,0,0,0,1,6,1,0,False


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,is_home_batting
0,718769,2023-03-30,ATH,ATH,LAA,33,29,6,8,4,0,0,0,0,0,4,2,0,True
1,718757,2023-04-01,ATH,ATH,LAA,34,30,5,8,3,0,0,0,1,1,4,0,0,True
2,718734,2023-04-02,ATH,ATH,LAA,35,31,5,6,3,1,0,0,0,0,4,1,0,True
3,718721,2023-04-03,ATH,ATH,CLE,46,43,14,26,3,0,0,0,0,3,8,3,0,True
4,718707,2023-04-04,ATH,ATH,CLE,37,32,6,9,4,0,1,0,0,0,3,3,0,True


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,is_home_batting
0,745687,2024-03-28,ATH,ATH,CLE,31,30,4,5,1,0,0,0,0,0,3,1,0,True
1,745682,2024-03-29,ATH,ATH,CLE,36,32,8,16,2,0,2,0,0,2,4,2,0,True
2,745684,2024-03-30,ATH,ATH,CLE,40,32,6,7,6,2,0,0,0,0,5,1,0,True
3,745683,2024-03-31,ATH,ATH,CLE,36,30,9,14,4,1,0,1,0,0,6,1,2,True
4,745675,2024-04-01,ATH,ATH,BOS,31,31,4,5,0,0,0,0,0,0,3,1,0,True


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,is_home_batting
0,778547,2025-03-27,ATH,SEA,ATH,30,29,3,9,1,0,0,0,0,2,1,0,0,False
1,778541,2025-03-28,ATH,SEA,ATH,44,38,12,23,5,1,0,0,0,3,7,2,0,False
2,778521,2025-03-29,ATH,SEA,ATH,38,36,9,12,2,0,0,0,0,1,8,0,0,False
3,778513,2025-03-30,ATH,SEA,ATH,34,30,4,8,3,0,0,1,0,1,2,1,0,False
4,778501,2025-03-31,ATH,ATH,CHC,38,36,10,15,2,0,0,0,0,1,7,2,0,True


### Making Copies for Missing Data

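Similar to pitching, each team's first game of a season has no prior games, so its rolling metrics will be missing. We will address this by using each team's mean OBP and ISO from the **prior** season. To compute those means from the unmodified game totals, we first copy each `team_game_batting_YYYY` DataFrame before any rolling columns are added.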

In [62]:
# Snapshot each season's team-game batting totals as team_game_batting_<year>_copy
# so the unmodified totals remain available after later cells reassign the original name.
for y in range(2021, 2026):
    name = f"team_game_batting_{y}"
    copy_name = f"{name}_copy"
    df = globals().get(name)

    if df is not None:
        globals()[copy_name] = df.copy()
        print(f"{y}: created {copy_name}")
    else:
        print(f"{y}: missing {name}")

2021: created team_game_batting_2021_copy
2022: created team_game_batting_2022_copy
2023: created team_game_batting_2023_copy
2024: created team_game_batting_2024_copy
2025: created team_game_batting_2025_copy


### Add time-based rolling batting sums by team

This function adds **rolling sums** of team batting totals over pre-defined time windows (default: **3 days** and **7 days**). For each team, the rolling totals are computed using **only prior games** (by shifting one game), which prevents information from the current game from entering its own features. The resulting columns are named like `roll_3D_H`, `roll_7D_AB`, and `roll_3D_HR`, and the output is returned sorted by team and game date.



In [63]:
# Add 3-day and 7-day rolling sums of the counting stats needed for OBP and ISO
# to each season's team-game batting frame (2022 onward).
for y in range(2022, 2026):
    name = f"team_game_batting_{y}"
    df = globals().get(name)

    if df is not None:
        rolled_df = add_time_rolling_batting_sums(
            df,
            windows=("3D", "7D"),
            sum_cols=["AB", "H", "BB", "HBP", "SF", "HR", "_2B", "_3B"],
            min_periods=1,
        )
        # Rebind the season's name to the frame that now carries the rolling columns.
        globals()[name] = rolled_df
        print(f"{y}: updated {name} ({len(rolled_df):,} rows)")
    else:
        print(f"{y}: missing {name}")

2022: updated team_game_batting_2022 (4,860 rows)
2023: updated team_game_batting_2023 (4,860 rows)


  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)


2024: updated team_game_batting_2024 (4,858 rows)
2025: updated team_game_batting_2025 (4,860 rows)


  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)


In [64]:
# Preview each season's team-game batting frame after the rolling sums were added.
for y, season_df in (
    (2022, team_game_batting_2022),
    (2023, team_game_batting_2023),
    (2024, team_game_batting_2024),
    (2025, team_game_batting_2025),
):
    display(HTML(f"<h4>Season {y}</h4>"))
    display(season_df.head(5))


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,661131,2022-04-08,ATH,PHI,ATH,35,34,6,13,0,...,,,,,,,,,,
1,661130,2022-04-09,ATH,PHI,ATH,31,30,5,6,1,...,1.0,0.0,34.0,6.0,0.0,1.0,0.0,2.0,1.0,0.0
2,661129,2022-04-10,ATH,PHI,ATH,39,35,9,14,3,...,2.0,0.0,64.0,11.0,1.0,1.0,0.0,2.0,2.0,0.0
3,661915,2022-04-11,ATH,TB,ATH,45,41,13,29,3,...,4.0,0.0,99.0,20.0,4.0,2.0,0.0,3.0,4.0,0.0
4,661944,2022-04-12,ATH,TB,ATH,45,35,8,12,9,...,5.0,1.0,140.0,33.0,7.0,3.0,0.0,7.0,6.0,1.0


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,718769,2023-03-30,ATH,ATH,LAA,33,29,6,8,4,...,,,,,,,,,,
1,718757,2023-04-01,ATH,ATH,LAA,34,30,5,8,3,...,2.0,0.0,29.0,6.0,4.0,0.0,0.0,0.0,2.0,0.0
2,718734,2023-04-02,ATH,ATH,LAA,35,31,5,6,3,...,2.0,0.0,59.0,11.0,7.0,0.0,0.0,1.0,2.0,0.0
3,718721,2023-04-03,ATH,ATH,CLE,46,43,14,26,3,...,3.0,0.0,90.0,16.0,10.0,1.0,0.0,1.0,3.0,0.0
4,718707,2023-04-04,ATH,ATH,CLE,37,32,6,9,4,...,4.0,0.0,133.0,30.0,13.0,1.0,0.0,4.0,6.0,0.0


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,745687,2024-03-28,ATH,ATH,CLE,31,30,4,5,1,...,,,,,,,,,,
1,745682,2024-03-29,ATH,ATH,CLE,36,32,8,16,2,...,1.0,0.0,30.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0
2,745684,2024-03-30,ATH,ATH,CLE,40,32,6,7,6,...,3.0,0.0,62.0,12.0,3.0,0.0,2.0,2.0,3.0,0.0
3,745683,2024-03-31,ATH,ATH,CLE,36,30,9,14,4,...,4.0,0.0,94.0,18.0,9.0,2.0,2.0,2.0,4.0,0.0
4,745675,2024-04-01,ATH,ATH,BOS,31,31,4,5,0,...,4.0,2.0,124.0,27.0,13.0,3.0,2.0,2.0,5.0,2.0


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,778547,2025-03-27,ATH,SEA,ATH,30,29,3,9,1,...,,,,,,,,,,
1,778541,2025-03-28,ATH,SEA,ATH,44,38,12,23,5,...,0.0,0.0,29.0,3.0,1.0,0.0,0.0,2.0,0.0,0.0
2,778521,2025-03-29,ATH,SEA,ATH,38,36,9,12,2,...,2.0,0.0,67.0,15.0,6.0,1.0,0.0,5.0,2.0,0.0
3,778513,2025-03-30,ATH,SEA,ATH,34,30,4,8,3,...,2.0,0.0,103.0,24.0,8.0,1.0,0.0,6.0,2.0,0.0
4,778501,2025-03-31,ATH,ATH,CHC,38,36,10,15,2,...,3.0,0.0,133.0,28.0,11.0,1.0,0.0,7.0,3.0,0.0


### Add rolling OBP and ISO from rolling batting totals

This step computes **rolling OBP** and **rolling ISO** directly from the rolling sum columns that were created earlier (e.g., rolling `AB`, `H`, `BB`, `HBP`, `SF`, `HR`, `2B`, `3B`). For each window (default: `3D`, `7D`), it derives:

- `roll_{w}_OBP` using the rolling totals in the OBP formula (with a denominator check to avoid divide-by-zero)
- `roll_{w}_ISO` using rolling extra-base production divided by rolling at-bats (also guarded against divide-by-zero)


In [65]:
# Derive rolling OBP / ISO columns for each season from its rolling-sum columns.
for y in range(2022, 2026):
    name = f"team_game_batting_{y}"
    df = globals().get(name)

    if df is not None:
        # The batch helper is called with a one-entry {year: frame} mapping and
        # the result is read back under the same year key.
        batch_result = add_rolling_obp_iso_batch({y: df})
        globals()[name] = batch_result[y]
        print(f"{y}: updated {name}")
    else:
        print(f"{y}: missing {name}")


2022: updated team_game_batting_2022
2023: updated team_game_batting_2023
2024: updated team_game_batting_2024
2025: updated team_game_batting_2025


In [66]:
# Preview each season's frame with the new rolling OBP / ISO columns.
for y, season_df in (
    (2022, team_game_batting_2022),
    (2023, team_game_batting_2023),
    (2024, team_game_batting_2024),
    (2025, team_game_batting_2025),
):
    display(HTML(f"<h4>Season {y}</h4>"))
    display(season_df.head(5))


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,661131,2022-04-08,ATH,PHI,ATH,35,34,6,13,0,...,,,,,,,,,,
1,661130,2022-04-09,ATH,PHI,ATH,31,30,5,6,1,...,0.0,1.0,0.0,2.0,1.0,0.0,0.2,0.205882,0.2,0.205882
2,661129,2022-04-10,ATH,PHI,ATH,39,35,9,14,3,...,1.0,1.0,0.0,2.0,2.0,0.0,0.19697,0.125,0.19697,0.125
3,661915,2022-04-11,ATH,TB,ATH,45,41,13,29,3,...,4.0,2.0,0.0,3.0,4.0,0.0,0.247619,0.131313,0.247619,0.131313
4,661944,2022-04-12,ATH,TB,ATH,45,35,8,12,9,...,7.0,3.0,0.0,7.0,6.0,1.0,0.313043,0.207547,0.286667,0.207143


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,718769,2023-03-30,ATH,ATH,LAA,33,29,6,8,4,...,,,,,,,,,,
1,718757,2023-04-01,ATH,ATH,LAA,34,30,5,8,3,...,4.0,0.0,0.0,0.0,2.0,0.0,0.30303,0.068966,0.30303,0.068966
2,718734,2023-04-02,ATH,ATH,LAA,35,31,5,6,3,...,7.0,0.0,0.0,1.0,2.0,0.0,0.272727,0.084746,0.272727,0.084746
3,718721,2023-04-03,ATH,ATH,CLE,46,43,14,26,3,...,10.0,1.0,0.0,1.0,3.0,0.0,0.267327,0.066667,0.267327,0.066667
4,718707,2023-04-04,ATH,ATH,CLE,37,32,6,9,4,...,13.0,1.0,0.0,4.0,6.0,0.0,0.298246,0.153846,0.29932,0.135338


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,745687,2024-03-28,ATH,ATH,CLE,31,30,4,5,1,...,,,,,,,,,,
1,745682,2024-03-29,ATH,ATH,CLE,36,32,8,16,2,...,1.0,0.0,0.0,0.0,1.0,0.0,0.16129,0.033333,0.16129,0.033333
2,745684,2024-03-30,ATH,ATH,CLE,40,32,6,7,6,...,3.0,0.0,2.0,2.0,3.0,0.0,0.223881,0.145161,0.223881,0.145161
3,745683,2024-03-31,ATH,ATH,CLE,36,30,9,14,4,...,9.0,2.0,2.0,2.0,4.0,0.0,0.271028,0.106383,0.271028,0.106383
4,745675,2024-04-01,ATH,ATH,BOS,31,31,4,5,0,...,13.0,3.0,2.0,2.0,5.0,2.0,0.342342,0.148936,0.302817,0.120968


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,778547,2025-03-27,ATH,SEA,ATH,30,29,3,9,1,...,,,,,,,,,,
1,778541,2025-03-28,ATH,SEA,ATH,44,38,12,23,5,...,1.0,0.0,0.0,2.0,0.0,0.0,0.133333,0.206897,0.133333,0.206897
2,778521,2025-03-29,ATH,SEA,ATH,38,36,9,12,2,...,6.0,1.0,0.0,5.0,2.0,0.0,0.297297,0.253731,0.297297,0.253731
3,778513,2025-03-30,ATH,SEA,ATH,34,30,4,8,3,...,8.0,1.0,0.0,6.0,2.0,0.0,0.294643,0.194175,0.294643,0.194175
4,778501,2025-03-31,ATH,ATH,CHC,38,36,10,15,2,...,11.0,1.0,0.0,7.0,3.0,0.0,0.313043,0.173077,0.275862,0.180451


## Missing Data

Next, we compute team-level summary batting metrics (including mean `OBP` and mean `ISO`) using the `team_game_batting_YYYY_copy` DataFrames created earlier. We then use the prior season’s team means to fill the missing rolling `OBP` and `ISO` values at the start of the season—specifically for each team’s first game—when there is not yet enough historical data to compute rolling metrics.


In [67]:
# Compute per-team season batting means (team_batting_means_<year>) from the
# team_game_batting_<year>_copy snapshots.
for y in range(2021, 2026):
    name, out_name = f"team_game_batting_{y}_copy", f"team_batting_means_{y}"
    df = globals().get(name)

    if df is not None:
        globals()[out_name] = calculate_mean_obp_iso(df)
        print(f"{y}: created {out_name}")
    else:
        print(f"{y}: missing {name}")


2021: created team_batting_means_2021
2022: created team_batting_means_2022
2023: created team_batting_means_2023
2024: created team_batting_means_2024
2025: created team_batting_means_2025


In [68]:
# Preview the first five teams of each season's batting-means table.
for y, means_df in (
    (2021, team_batting_means_2021),
    (2022, team_batting_means_2022),
    (2023, team_batting_means_2023),
    (2024, team_batting_means_2024),
    (2025, team_batting_means_2025),
):
    display(HTML(f"<h4>Season {y}</h4>"))
    display(means_df.head(5))


Unnamed: 0,batting_team,games,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,mean_OBP,mean_ISO
0,ATH,162,6104,5395,1284,2190,545,98,49,17,0,199,795,271,19,0.310238,0.165121
1,ATL,161,6056,5363,1307,2333,549,67,43,32,2,239,779,269,20,0.309574,0.185073
2,AZ,162,6144,5489,1297,2099,537,54,28,32,4,144,814,308,31,0.302096,0.1436
3,BAL,162,5983,5420,1296,2177,451,65,31,14,2,195,820,266,15,0.297254,0.160065
4,BOS,162,6122,5495,1434,2467,512,61,42,10,2,219,862,330,23,0.321112,0.182647


Unnamed: 0,batting_team,games,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,mean_OBP,mean_ISO
0,ATH,162,5863,5314,1147,1837,433,59,33,22,2,137,746,249,15,0.27439,0.12516
1,ATL,162,6082,5509,1394,2443,470,66,36,1,0,243,842,298,11,0.31031,0.187164
2,AZ,162,6027,5351,1232,2061,531,60,50,31,4,173,773,262,24,0.297268,0.152505
3,BAL,162,6049,5429,1281,2119,476,83,43,12,6,171,810,275,25,0.298503,0.150114
4,BOS,162,6144,5539,1427,2268,478,63,50,12,2,155,908,352,12,0.3123,0.148461


Unnamed: 0,batting_team,games,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,mean_OBP,mean_ISO
0,ATH,162,5966,5311,1187,1967,498,85,36,28,8,171,770,225,21,0.289625,0.14063
1,ATL,162,6249,5597,1543,2803,538,67,43,2,2,307,920,293,23,0.336834,0.221362
2,AZ,162,6124,5436,1359,2219,540,58,50,36,4,166,875,274,44,0.315274,0.155614
3,BAL,162,6123,5495,1399,2313,512,45,47,21,3,183,879,309,28,0.311707,0.16233
4,BOS,162,6174,5562,1437,2360,486,71,35,11,9,182,897,339,19,0.317211,0.162799


Unnamed: 0,batting_team,games,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,mean_OBP,mean_ISO
0,ATH,162,6033,5431,1266,2132,493,53,37,19,0,196,811,240,19,0.293948,0.154202
1,ATL,162,6075,5481,1333,2275,485,58,39,9,3,213,832,273,15,0.303635,0.167732
2,AZ,162,6282,5520,1450,2426,569,84,66,34,9,211,933,269,37,0.329052,0.172021
3,BAL,162,6176,5567,1391,2424,489,64,45,6,5,235,861,262,33,0.307583,0.181514
4,BOS,162,6192,5577,1404,2357,493,73,40,7,2,194,869,311,30,0.310759,0.166145


Unnamed: 0,batting_team,games,PA,AB,H,TB,BB,HBP,SF,SH,CI,HR,_1B,_2B,_3B,mean_OBP,mean_ISO
0,ATH,162,6151,5547,1403,2388,502,44,35,18,5,219,872,296,16,0.310548,0.172463
1,ATL,162,6186,5508,1349,2200,575,51,36,15,1,190,897,243,19,0.312601,0.151737
2,AZ,162,6210,5480,1377,2372,545,81,64,37,3,214,848,277,38,0.319795,0.178207
3,BAL,162,6020,5416,1273,2135,484,75,41,4,0,191,812,251,19,0.2974,0.154252
4,BOS,162,6206,5562,1414,2344,518,72,41,13,0,186,880,324,24,0.316414,0.162508


### Filling in Missing Data

Next, we fill in the missing rolling `OBP` and `ISO` values that occur in each team’s first game of the season. Since there is no prior game history to compute rolling metrics at that point, we replace those missing values with the team’s prior season average to ensure a complete and consistent feature set for modeling.


In [69]:
# Fill missing rolling OBP / ISO values in season y using the team means
# from season y - 1.
for y in range(2022, 2026):
    df_name, means_name = f"team_game_batting_{y}", f"team_batting_means_{y-1}"
    df = globals().get(df_name)
    prior_means = globals().get(means_name)

    if df is not None and prior_means is not None:
        globals()[df_name] = fill_missing_rolling_from_prior_year(df, prior_means)
        print(f"{y}: filled missing rolling OBP/ISO in {df_name} using {means_name}")
    else:
        # Either the season frame or the prior-season means table is not defined.
        print(f"{y}: missing {df_name} or {means_name}")


2022: filled missing rolling OBP/ISO in team_game_batting_2022 using team_batting_means_2021
2023: filled missing rolling OBP/ISO in team_game_batting_2023 using team_batting_means_2022
2024: filled missing rolling OBP/ISO in team_game_batting_2024 using team_batting_means_2023
2025: filled missing rolling OBP/ISO in team_game_batting_2025 using team_batting_means_2024


In [70]:
# Preview each season's frame after the first-game rolling values were filled.
for y, season_df in (
    (2022, team_game_batting_2022),
    (2023, team_game_batting_2023),
    (2024, team_game_batting_2024),
    (2025, team_game_batting_2025),
):
    display(HTML(f"<h4>Season {y}</h4>"))
    display(season_df.head(5))


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,661131,2022-04-08,ATH,PHI,ATH,35,34,6,13,0,...,,,,,,,0.310238,0.165121,0.310238,0.165121
1,661130,2022-04-09,ATH,PHI,ATH,31,30,5,6,1,...,0.0,1.0,0.0,2.0,1.0,0.0,0.2,0.205882,0.2,0.205882
2,661129,2022-04-10,ATH,PHI,ATH,39,35,9,14,3,...,1.0,1.0,0.0,2.0,2.0,0.0,0.19697,0.125,0.19697,0.125
3,661915,2022-04-11,ATH,TB,ATH,45,41,13,29,3,...,4.0,2.0,0.0,3.0,4.0,0.0,0.247619,0.131313,0.247619,0.131313
4,661944,2022-04-12,ATH,TB,ATH,45,35,8,12,9,...,7.0,3.0,0.0,7.0,6.0,1.0,0.313043,0.207547,0.286667,0.207143


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,718769,2023-03-30,ATH,ATH,LAA,33,29,6,8,4,...,,,,,,,0.27439,0.12516,0.27439,0.12516
1,718757,2023-04-01,ATH,ATH,LAA,34,30,5,8,3,...,4.0,0.0,0.0,0.0,2.0,0.0,0.30303,0.068966,0.30303,0.068966
2,718734,2023-04-02,ATH,ATH,LAA,35,31,5,6,3,...,7.0,0.0,0.0,1.0,2.0,0.0,0.272727,0.084746,0.272727,0.084746
3,718721,2023-04-03,ATH,ATH,CLE,46,43,14,26,3,...,10.0,1.0,0.0,1.0,3.0,0.0,0.267327,0.066667,0.267327,0.066667
4,718707,2023-04-04,ATH,ATH,CLE,37,32,6,9,4,...,13.0,1.0,0.0,4.0,6.0,0.0,0.298246,0.153846,0.29932,0.135338


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,745687,2024-03-28,ATH,ATH,CLE,31,30,4,5,1,...,,,,,,,0.289625,0.14063,0.289625,0.14063
1,745682,2024-03-29,ATH,ATH,CLE,36,32,8,16,2,...,1.0,0.0,0.0,0.0,1.0,0.0,0.16129,0.033333,0.16129,0.033333
2,745684,2024-03-30,ATH,ATH,CLE,40,32,6,7,6,...,3.0,0.0,2.0,2.0,3.0,0.0,0.223881,0.145161,0.223881,0.145161
3,745683,2024-03-31,ATH,ATH,CLE,36,30,9,14,4,...,9.0,2.0,2.0,2.0,4.0,0.0,0.271028,0.106383,0.271028,0.106383
4,745675,2024-04-01,ATH,ATH,BOS,31,31,4,5,0,...,13.0,3.0,2.0,2.0,5.0,2.0,0.342342,0.148936,0.302817,0.120968


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,778547,2025-03-27,ATH,SEA,ATH,30,29,3,9,1,...,,,,,,,0.293948,0.154202,0.293948,0.154202
1,778541,2025-03-28,ATH,SEA,ATH,44,38,12,23,5,...,1.0,0.0,0.0,2.0,0.0,0.0,0.133333,0.206897,0.133333,0.206897
2,778521,2025-03-29,ATH,SEA,ATH,38,36,9,12,2,...,6.0,1.0,0.0,5.0,2.0,0.0,0.297297,0.253731,0.297297,0.253731
3,778513,2025-03-30,ATH,SEA,ATH,34,30,4,8,3,...,8.0,1.0,0.0,6.0,2.0,0.0,0.294643,0.194175,0.294643,0.194175
4,778501,2025-03-31,ATH,ATH,CHC,38,36,10,15,2,...,11.0,1.0,0.0,7.0,3.0,0.0,0.313043,0.173077,0.275862,0.180451


### Dropping rolling sum columns

After computing rolling `OBP` and `ISO`, we no longer need the intermediate rolling sum columns (e.g., rolling `AB`, `H`, `BB`, `HBP`, `SF`, `HR`, `2B`, `3B`). This step removes those columns from each `team_game_batting_YYYY` DataFrame to keep the dataset smaller and focused on the final features used for modeling.


In [71]:
# TODO: Make function or combine with preprocessing.schema drop_rolled_component_cols function

# Intermediate rolling-sum columns: one per (window, counting stat) pair,
# listed window by window.
roll_windows = ("3D", "7D")
roll_components = ("AB", "H", "BB", "HBP", "SF", "HR", "_2B", "_3B")
cols_to_drop = [
    f"roll_{window}_{component}"
    for window in roll_windows
    for component in roll_components
]

for y in range(2022, 2026):
    name = f"team_game_batting_{y}"
    df = globals().get(name)

    if df is not None:
        # errors="ignore" skips any listed column that is not present in the frame.
        globals()[name] = df.drop(columns=cols_to_drop, errors="ignore")
        print(f"{y}: dropped rolling sum columns from {name}")
    else:
        print(f"{y}: missing {name}")

2022: dropped rolling sum columns from team_game_batting_2022
2023: dropped rolling sum columns from team_game_batting_2023
2024: dropped rolling sum columns from team_game_batting_2024
2025: dropped rolling sum columns from team_game_batting_2025


In [72]:
# Preview each season's frame once the intermediate rolling-sum columns are gone.
for y, season_df in (
    (2022, team_game_batting_2022),
    (2023, team_game_batting_2023),
    (2024, team_game_batting_2024),
    (2025, team_game_batting_2025),
):
    display(HTML(f"<h4>Season {y}</h4>"))
    display(season_df.head(5))


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,CI,HR,_1B,_2B,_3B,is_home_batting,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,661131,2022-04-08,ATH,PHI,ATH,35,34,6,13,0,...,0,2,3,1,0,False,0.310238,0.165121,0.310238,0.165121
1,661130,2022-04-09,ATH,PHI,ATH,31,30,5,6,1,...,0,0,4,1,0,False,0.2,0.205882,0.2,0.205882
2,661129,2022-04-10,ATH,PHI,ATH,39,35,9,14,3,...,0,1,6,2,0,False,0.19697,0.125,0.19697,0.125
3,661915,2022-04-11,ATH,TB,ATH,45,41,13,29,3,...,0,4,6,2,1,False,0.247619,0.131313,0.247619,0.131313
4,661944,2022-04-12,ATH,TB,ATH,45,35,8,12,9,...,0,1,6,1,0,False,0.313043,0.207547,0.286667,0.207143


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,CI,HR,_1B,_2B,_3B,is_home_batting,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,718769,2023-03-30,ATH,ATH,LAA,33,29,6,8,4,...,0,0,4,2,0,True,0.27439,0.12516,0.27439,0.12516
1,718757,2023-04-01,ATH,ATH,LAA,34,30,5,8,3,...,1,1,4,0,0,True,0.30303,0.068966,0.30303,0.068966
2,718734,2023-04-02,ATH,ATH,LAA,35,31,5,6,3,...,0,0,4,1,0,True,0.272727,0.084746,0.272727,0.084746
3,718721,2023-04-03,ATH,ATH,CLE,46,43,14,26,3,...,0,3,8,3,0,True,0.267327,0.066667,0.267327,0.066667
4,718707,2023-04-04,ATH,ATH,CLE,37,32,6,9,4,...,0,0,3,3,0,True,0.298246,0.153846,0.29932,0.135338


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,CI,HR,_1B,_2B,_3B,is_home_batting,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,745687,2024-03-28,ATH,ATH,CLE,31,30,4,5,1,...,0,0,3,1,0,True,0.289625,0.14063,0.289625,0.14063
1,745682,2024-03-29,ATH,ATH,CLE,36,32,8,16,2,...,0,2,4,2,0,True,0.16129,0.033333,0.16129,0.033333
2,745684,2024-03-30,ATH,ATH,CLE,40,32,6,7,6,...,0,0,5,1,0,True,0.223881,0.145161,0.223881,0.145161
3,745683,2024-03-31,ATH,ATH,CLE,36,30,9,14,4,...,0,0,6,1,2,True,0.271028,0.106383,0.271028,0.106383
4,745675,2024-04-01,ATH,ATH,BOS,31,31,4,5,0,...,0,0,3,1,0,True,0.342342,0.148936,0.302817,0.120968


Unnamed: 0,game_id,game_date,batting_team,home_team,away_team,PA,AB,H,TB,BB,...,CI,HR,_1B,_2B,_3B,is_home_batting,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,778547,2025-03-27,ATH,SEA,ATH,30,29,3,9,1,...,0,2,1,0,0,False,0.293948,0.154202,0.293948,0.154202
1,778541,2025-03-28,ATH,SEA,ATH,44,38,12,23,5,...,0,3,7,2,0,False,0.133333,0.206897,0.133333,0.206897
2,778521,2025-03-29,ATH,SEA,ATH,38,36,9,12,2,...,0,1,8,0,0,False,0.297297,0.253731,0.297297,0.253731
3,778513,2025-03-30,ATH,SEA,ATH,34,30,4,8,3,...,0,1,2,1,0,False,0.294643,0.194175,0.294643,0.194175
4,778501,2025-03-31,ATH,ATH,CHC,38,36,10,15,2,...,0,1,7,2,0,True,0.313043,0.173077,0.275862,0.180451


### Home and Away

This step converts the team-level rolling batting table (which contains two rows per game — one per batting team) into a single game-level matchup table with **one row per `game_id`**.

Because each `team_game_batting_YYYY` DataFrame already includes `home_team` and `away_team`, we identify:

- the **home batting row** as `batting_team == home_team`
- the **away batting row** as `batting_team == away_team`

We then:

1. Split the table into home and away DataFrames.
2. Rename rolling feature columns with `_home` and `_away` suffixes  
   (e.g., `roll_3D_OBP_home`, `roll_3D_OBP_away`).
3. Merge the two sides back together on `game_id`, `game_date`, `home_team`, and `away_team`.

The result is a clean, one-row-per-game matchup table that is:

- Robust to doubleheaders  
- Free of fragile string parsing  
- Ready for modeling  
- Suitable for computing home-minus-away deltas


In [73]:
# Rolling features carried into the per-game matchup table.
feat_cols = ["roll_3D_OBP", "roll_3D_ISO", "roll_7D_OBP", "roll_7D_ISO"]

# For each season, split the team-game frame into home and away frames, then
# combine them into games_batting_rolls_<year>.
for y in range(2022, 2026):
    team_name, out_name = f"team_game_batting_{y}", f"games_batting_rolls_{y}"
    team_df = globals().get(team_name)

    if team_df is None:
        print(f"{y}: missing {team_name}")
    else:
        # Split into home/away frames, keeping batting_team on both sides.
        home_df, away_df = split_home_away_team_game(
            team_df,
            feat_cols=feat_cols,
            keep_batting_team=True,
        )

        # Keep the split frames around for inspection/debugging.
        globals()[f"home_batting_{y}"] = home_df
        globals()[f"away_batting_{y}"] = away_df

        # Combine the two sides; batting_team_x / batting_team_y stay in the output.
        out_df = combine_home_away_by_game(
            home_df,
            away_df,
            drop_batting_team_after_merge=False,
        )
        globals()[out_name] = out_df

        # Sanity check: count games with any missing home/away rolling feature.
        roll_cols_out = [
            f"{c}{side_suffix}"
            for side_suffix in ("_home", "_away")
            for c in feat_cols
        ]
        na_rows = int(out_df[roll_cols_out].isna().any(axis=1).sum())

        print(
            f"{y}: created {out_name} "
            f"({len(out_df):,} rows; NA rows in rolls: {na_rows:,})"
        )


2022: created games_batting_rolls_2022 (2,430 rows; NA rows in rolls: 0)
2023: created games_batting_rolls_2023 (2,430 rows; NA rows in rolls: 0)
2024: created games_batting_rolls_2024 (2,429 rows; NA rows in rolls: 0)
2025: created games_batting_rolls_2025 (2,430 rows; NA rows in rolls: 0)


In [74]:
# Preview the first five games of each season's home/away rolling-feature table.
for y, games_df in (
    (2022, games_batting_rolls_2022),
    (2023, games_batting_rolls_2023),
    (2024, games_batting_rolls_2024),
    (2025, games_batting_rolls_2025),
):
    display(HTML(f"<h4>Season {y}</h4>"))
    display(games_df.head(5))


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team_x,roll_3D_OBP_home,roll_3D_ISO_home,roll_7D_OBP_home,roll_7D_ISO_home,batting_team_y,roll_3D_OBP_away,roll_3D_ISO_away,roll_7D_OBP_away,roll_7D_ISO_away
0,661042,2022-04-07,LAA,HOU,LAA,0.302841,0.158766,0.302841,0.158766,HOU,0.330001,0.171959,0.330001,0.171959
1,661577,2022-04-07,ATL,CIN,ATL,0.309574,0.185073,0.309574,0.185073,CIN,0.320802,0.178799,0.320802,0.178799
2,662021,2022-04-07,STL,PIT,STL,0.3066,0.164839,0.3066,0.164839,PIT,0.301869,0.126334,0.301869,0.126334
3,662571,2022-04-07,WSH,NYM,WSH,0.329574,0.155429,0.329574,0.155429,NYM,0.308161,0.147429,0.308161,0.147429
4,662766,2022-04-07,KC,CLE,KC,0.30059,0.145212,0.30059,0.145212,CLE,0.294697,0.164661,0.294697,0.164661


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team_x,roll_3D_OBP_home,roll_3D_ISO_home,roll_7D_OBP_home,roll_7D_ISO_home,batting_team_y,roll_3D_OBP_away,roll_3D_ISO_away,roll_7D_OBP_away,roll_7D_ISO_away
0,718767,2023-03-30,SEA,CLE,SEA,0.307201,0.156428,0.307201,0.156428,CLE,0.309944,0.125657,0.309944,0.125657
1,718768,2023-03-30,HOU,CWS,HOU,0.312216,0.17135,0.312216,0.17135,CWS,0.303165,0.128579,0.303165,0.128579
2,718769,2023-03-30,ATH,LAA,ATH,0.27439,0.12516,0.27439,0.12516,LAA,0.291238,0.152138,0.291238,0.152138
3,718770,2023-03-30,LAD,AZ,LAD,0.326771,0.182422,0.326771,0.182422,AZ,0.297268,0.152505,0.297268,0.152505
4,718772,2023-03-30,STL,TOR,STL,0.317426,0.16383,0.317426,0.16383,TOR,0.321388,0.164502,0.321388,0.164502


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team_x,roll_3D_OBP_home,roll_3D_ISO_home,roll_7D_OBP_home,roll_7D_ISO_home,batting_team_y,roll_3D_OBP_away,roll_3D_ISO_away,roll_7D_OBP_away,roll_7D_ISO_away
0,745444,2024-03-20,SD,LAD,SD,0.322325,0.164487,0.322325,0.164487,LAD,0.332953,0.194873,0.332953,0.194873
1,746175,2024-03-21,LAD,SD,LAD,0.377778,0.0,0.377778,0.0,SD,0.235294,0.0,0.235294,0.0
2,745039,2024-03-28,TEX,CHC,TEX,0.329514,0.186171,0.329514,0.186171,CHC,0.322298,0.162632,0.322298,0.162632
3,745116,2024-03-28,TB,TOR,TB,0.3242,0.182664,0.3242,0.182664,TOR,0.32075,0.156958,0.32075,0.156958
4,745283,2024-03-28,SEA,BOS,SEA,0.315463,0.167137,0.315463,0.167137,BOS,0.317211,0.162799,0.317211,0.162799


Unnamed: 0,game_id,game_date,home_team,away_team,batting_team_x,roll_3D_OBP_home,roll_3D_ISO_home,roll_7D_OBP_home,roll_7D_ISO_home,batting_team_y,roll_3D_OBP_away,roll_3D_ISO_away,roll_7D_OBP_away,roll_7D_ISO_away
0,778563,2025-03-18,CHC,LAD,CHC,0.308032,0.146972,0.308032,0.146972,LAD,0.327347,0.185742,0.327347,0.185742
1,778564,2025-03-19,CHC,LAD,CHC,0.15625,0.033333,0.15625,0.033333,LAD,0.357143,0.058824,0.357143,0.058824
2,778545,2025-03-27,SD,ATL,SD,0.31605,0.153786,0.31605,0.153786,ATL,0.303635,0.167732,0.303635,0.167732
3,778546,2025-03-27,LAD,DET,LAD,0.368421,0.4,0.368421,0.4,DET,0.291766,0.14627,0.291766,0.14627
4,778547,2025-03-27,SEA,ATH,SEA,0.304815,0.1484,0.304815,0.1484,ATH,0.293948,0.154202,0.293948,0.154202


### Calculating Difference Between Home and Away

This step creates a new game-level table of **batting deltas** by subtracting the away team’s rolling metrics from the home team’s rolling metrics (**home − away**). It expects the game-level batting rolls DataFrame to contain paired columns such as `roll_3D_OBP_home` / `roll_3D_OBP_away` (and similarly for ISO and other rolling windows).

The output retains `game_id`, `game_date`, `home_team`, and `away_team`, and adds delta columns named:

- `Δroll_3D_OBP`
- `Δroll_3D_ISO`
- `Δroll_7D_OBP`
- `Δroll_7D_ISO`

These delta features represent the relative offensive form of the home team compared to the away team and are used as matchup-level inputs for modeling.


In [75]:
# Build a batting-delta table for every season whose game-level rolls table
# exists in the notebook namespace; seasons without one are reported and skipped.
for y in range(2022, 2026):
    in_name = f"games_batting_rolls_{y}"
    out_name = f"game_batting_deltas_{y}"

    rolls = globals().get(in_name)
    if rolls is not None:
        deltas = make_batting_delta_df(rolls)
        # Publish the result under its season-specific name.
        globals()[out_name] = deltas
        print(f"{y}: created {out_name} ({len(deltas):,} rows)")
    else:
        print(f"{y}: missing {in_name}")


2022: created game_batting_deltas_2022 (2,430 rows)
2023: created game_batting_deltas_2023 (2,430 rows)
2024: created game_batting_deltas_2024 (2,429 rows)
2025: created game_batting_deltas_2025 (2,430 rows)


In [76]:
# Preview the first five batting-delta rows of every season.
display(
    HTML("<h4>Season 2022</h4>"), game_batting_deltas_2022.head(5),
    HTML("<h4>Season 2023</h4>"), game_batting_deltas_2023.head(5),
    HTML("<h4>Season 2024</h4>"), game_batting_deltas_2024.head(5),
    HTML("<h4>Season 2025</h4>"), game_batting_deltas_2025.head(5),
)


Unnamed: 0,game_id,game_date,home_team,away_team,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,661042,2022-04-07,LAA,HOU,-0.02716,-0.013193,-0.02716,-0.013193
1,661577,2022-04-07,ATL,CIN,-0.011229,0.006274,-0.011229,0.006274
2,662021,2022-04-07,STL,PIT,0.004731,0.038504,0.004731,0.038504
3,662571,2022-04-07,WSH,NYM,0.021413,0.008,0.021413,0.008
4,662766,2022-04-07,KC,CLE,0.005893,-0.01945,0.005893,-0.01945


Unnamed: 0,game_id,game_date,home_team,away_team,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,718767,2023-03-30,SEA,CLE,-0.002744,0.030771,-0.002744,0.030771
1,718768,2023-03-30,HOU,CWS,0.009051,0.042771,0.009051,0.042771
2,718769,2023-03-30,ATH,LAA,-0.016848,-0.026978,-0.016848,-0.026978
3,718770,2023-03-30,LAD,AZ,0.029503,0.029917,0.029503,0.029917
4,718772,2023-03-30,STL,TOR,-0.003963,-0.000672,-0.003963,-0.000672


Unnamed: 0,game_id,game_date,home_team,away_team,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,745444,2024-03-20,SD,LAD,-0.010627,-0.030386,-0.010627,-0.030386
1,746175,2024-03-21,LAD,SD,0.142484,0.0,0.142484,0.0
2,745039,2024-03-28,TEX,CHC,0.007217,0.023539,0.007217,0.023539
3,745116,2024-03-28,TB,TOR,0.003449,0.025706,0.003449,0.025706
4,745283,2024-03-28,SEA,BOS,-0.001748,0.004338,-0.001748,0.004338


Unnamed: 0,game_id,game_date,home_team,away_team,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,778563,2025-03-18,CHC,LAD,-0.019315,-0.038771,-0.019315,-0.038771
1,778564,2025-03-19,CHC,LAD,-0.200893,-0.02549,-0.200893,-0.02549
2,778545,2025-03-27,SD,ATL,0.012415,-0.013947,0.012415,-0.013947
3,778546,2025-03-27,LAD,DET,0.076655,0.25373,0.076655,0.25373
4,778547,2025-03-27,SEA,ATH,0.010867,-0.005802,0.010867,-0.005802


### Checking for Missing Values

In [77]:
# Report the number of missing cells in each season's batting-delta table
# and stop the notebook if any are found.
for year, deltas in (
    (2022, game_batting_deltas_2022),
    (2023, game_batting_deltas_2023),
    (2024, game_batting_deltas_2024),
    (2025, game_batting_deltas_2025),
):
    # Count missing cells across the whole frame in one pass.
    n_missing = int(deltas.isna().to_numpy().sum())
    print(f"\n{year}\nTotal missing cells: {n_missing}")

    assert n_missing == 0, f"{year}: Missing values found in game_batting_deltas_{year}"


2022
Total missing cells: 0

2023
Total missing cells: 0

2024
Total missing cells: 0

2025
Total missing cells: 0


### Game Numbers

**TODO**: Make a function that works with all of the other dataframes and, rather than returning the dataframe, returns True or False so that the result can be checked with an assert statement.

In [78]:
# Validation: count how many distinct games each team appears in (as home or
# away side) per season, so the totals can be eyeballed against the schedule.

batting_deltas = {
    2022: game_batting_deltas_2022,
    2023: game_batting_deltas_2023,
    2024: game_batting_deltas_2024,
    2025: game_batting_deltas_2025,
}

# One long (game_id, team, season) frame per season: home and away sides are
# stacked into a single "team" column.
rows = [
    season_deltas.melt(
        id_vars=["game_id"],
        value_vars=["home_team", "away_team"],
        value_name="team",
    ).assign(season=season_year)
    for season_year, season_deltas in batting_deltas.items()
]

# Distinct games per (team, season), pivoted to one column per season.
# nunique() makes any duplicated (game_id, team) rows irrelevant.
batting_games_per_team = (
    pd.concat(rows, ignore_index=True)
      .groupby(["team", "season"])["game_id"]
      .nunique()
      .unstack("season")
      .sort_index()
)

batting_games_per_team


season,2022,2023,2024,2025
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ATH,162,162,162,162
ATL,162,162,162,162
AZ,162,162,162,162
BAL,162,162,162,162
BOS,162,162,162,162
CHC,162,162,162,162
CIN,162,162,162,162
CLE,162,162,161,162
COL,162,162,162,162
CWS,162,162,162,162


## Fielding

**TODO**: Add a more robust explanation.


$$\text{BIP Outs Rate} = 1 - \frac{\text{BIP Hits}}{\text{BIP}}$$

where

$$\text{BIP} = \text{PA} - \text{K} - \text{BB} - \text{HBP} - \text{HR} - \text{SF}$$

$$\text{BIP Hits} = \text{H} - \text{HR}$$


In [79]:
# For every season with a plate-appearance table (pa_<year>), create
# pa_field_<year> by adding the fielding indicator columns.
for y in range(2021, 2026):
    src_name = f"pa_{y}"
    dst_name = f"pa_field_{y}"

    pa_frame = globals().get(src_name)
    if pa_frame is not None:
        # Hand the helper a copy so the source PA table is not the object it receives.
        globals()[dst_name] = add_fielding_indicators(pa_frame.copy())
        print(f"{dst_name}: fielding indicators added (from {src_name})")
    else:
        print(f"{src_name}: (not found)")

pa_field_2021: fielding indicators added (from pa_2021)
pa_field_2022: fielding indicators added (from pa_2022)
pa_field_2023: fielding indicators added (from pa_2023)
pa_field_2024: fielding indicators added (from pa_2024)
pa_field_2025: fielding indicators added (from pa_2025)


In [80]:
# Preview the first five plate-appearance rows (with fielding indicators) per season.
display(
    HTML("<h4>Season 2021</h4>"), pa_field_2021.head(5),
    HTML("<h4>Season 2022</h4>"), pa_field_2022.head(5),
    HTML("<h4>Season 2023</h4>"), pa_field_2023.head(5),
    HTML("<h4>Season 2024</h4>"), pa_field_2024.head(5),
    HTML("<h4>Season 2025</h4>"), pa_field_2025.head(5),
)


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,is_starter,starter_full_game,is_bb,is_hbp,is_sf,is_k,is_hr,is_h,is_bip_hit,is_bip
3,632169,2021-04-10,SF,COL,COL,1,Bot,2,0,0,...,1,0,0,0,0,0,0,0,0,1
14,632169,2021-04-10,SF,COL,COL,1,Bot,7,0,0,...,1,0,1,0,0,0,0,0,0,0
15,632169,2021-04-10,SF,COL,COL,1,Bot,7,0,0,...,1,0,0,0,0,0,0,1,1,1
19,632169,2021-04-10,SF,COL,COL,1,Bot,4,1,0,...,1,0,0,0,0,0,0,0,0,1
27,632169,2021-04-10,SF,COL,COL,1,Bot,8,2,0,...,1,0,0,0,0,0,0,0,0,1


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,is_starter,starter_full_game,is_bb,is_hbp,is_sf,is_k,is_hr,is_h,is_bip_hit,is_bip
0,661032,2022-04-26,LAA,CLE,CLE,1,Bot,5,0,0,...,1,0,0,0,0,0,0,0,0,1
1,661032,2022-04-26,LAA,CLE,CLE,1,Bot,1,1,0,...,1,0,0,0,0,0,0,0,0,1
2,661032,2022-04-26,LAA,CLE,CLE,1,Bot,5,2,0,...,1,0,0,0,0,1,0,0,0,0
3,661032,2022-04-26,LAA,CLE,CLE,2,Bot,2,0,0,...,1,0,0,0,0,0,0,0,0,1
4,661032,2022-04-26,LAA,CLE,CLE,2,Bot,1,1,0,...,1,0,0,0,0,0,0,1,1,1


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,is_starter,starter_full_game,is_bb,is_hbp,is_sf,is_k,is_hr,is_h,is_bip_hit,is_bip
0,716352,2023-10-01,KC,NYY,KC,1,Top,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,716352,2023-10-01,KC,NYY,KC,1,Top,2,0,0,...,1,0,0,0,0,0,0,1,1,1
7,716352,2023-10-01,KC,NYY,KC,1,Top,3,0,0,...,1,0,0,0,0,0,0,1,1,1
15,716352,2023-10-01,KC,NYY,KC,1,Top,10,0,0,...,1,0,0,0,0,0,0,1,1,1
18,716352,2023-10-01,KC,NYY,KC,1,Top,3,2,0,...,1,0,0,0,0,0,0,0,0,1


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,is_starter,starter_full_game,is_bb,is_hbp,is_sf,is_k,is_hr,is_h,is_bip_hit,is_bip
0,744795,2024-09-25,WSH,KC,KC,1,Bot,4,0,0,...,1,0,0,0,0,0,0,0,0,1
1,744795,2024-09-25,WSH,KC,KC,1,Bot,6,1,0,...,1,0,0,0,0,1,0,0,0,0
2,744795,2024-09-25,WSH,KC,KC,1,Bot,3,2,0,...,1,0,0,0,0,0,0,0,0,1
3,744795,2024-09-25,WSH,KC,KC,2,Bot,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,744795,2024-09-25,WSH,KC,KC,2,Bot,4,0,0,...,1,0,1,0,0,0,0,0,0,0


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,...,is_starter,starter_full_game,is_bb,is_hbp,is_sf,is_k,is_hr,is_h,is_bip_hit,is_bip
4,776135,2025-09-28,LAA,HOU,HOU,1,Bot,5,0,0,...,1,0,0,0,0,0,0,0,0,1
11,776135,2025-09-28,LAA,HOU,HOU,1,Bot,4,1,1,...,1,0,0,0,0,1,0,0,0,0
14,776135,2025-09-28,LAA,HOU,HOU,1,Bot,6,1,0,...,1,0,0,0,0,0,1,1,0,0
18,776135,2025-09-28,LAA,HOU,HOU,1,Bot,4,2,1,...,1,0,0,0,0,0,0,0,0,1
22,776135,2025-09-28,LAA,HOU,HOU,2,Bot,4,0,1,...,1,0,0,0,0,0,0,0,0,1


### Aggregating to the Team Level

In [81]:
# Aggregate each season's plate-appearance fielding indicators into
# game_fielding_<year>; seasons without a pa_field_<year> table are skipped.
for y in range(2021, 2026):
    pa_field = globals().get(f"pa_field_{y}")
    if pa_field is not None:
        globals()[f"game_fielding_{y}"] = make_game_fielding_bip_counts(pa_field)
        print(f"game_fielding_{y}: created")
    else:
        print(f"pa_field_{y}: (not found)")

game_fielding_2021: created
game_fielding_2022: created
game_fielding_2023: created
game_fielding_2024: created
game_fielding_2025: created


In [82]:
# Preview the first five team-game fielding count rows per season.
display(
    HTML("<h4>Season 2021</h4>"), game_fielding_2021.head(5),
    HTML("<h4>Season 2022</h4>"), game_fielding_2022.head(5),
    HTML("<h4>Season 2023</h4>"), game_fielding_2023.head(5),
    HTML("<h4>Season 2024</h4>"), game_fielding_2024.head(5),
    HTML("<h4>Season 2025</h4>"), game_fielding_2025.head(5),
)


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H
0,632169,2021-04-10,SF,COL,COL,22,4
1,632169,2021-04-10,SF,COL,SF,21,8
2,632170,2021-04-11,CWS,KC,CWS,20,6
3,632170,2021-04-11,CWS,KC,KC,26,4
4,632188,2021-04-11,SF,COL,COL,23,6


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H
0,661032,2022-04-26,LAA,CLE,CLE,23,7
1,661032,2022-04-26,LAA,CLE,LAA,19,4
2,661033,2022-04-24,LAA,BAL,BAL,25,6
3,661033,2022-04-24,LAA,BAL,LAA,20,7
4,661034,2022-04-25,LAA,CLE,CLE,22,6


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H
0,716352,2023-10-01,KC,NYY,KC,25,7
1,716352,2023-10-01,KC,NYY,NYY,22,8
2,716353,2023-10-01,STL,CIN,CIN,22,9
3,716353,2023-10-01,STL,CIN,STL,20,8
4,716354,2023-10-01,ATL,WSH,ATL,40,15


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H
0,744795,2024-09-25,WSH,KC,KC,22,2
1,744795,2024-09-25,WSH,KC,WSH,27,7
2,744796,2024-09-26,WSH,KC,KC,27,6
3,744796,2024-09-26,WSH,KC,WSH,31,7
4,744797,2024-09-27,WSH,PHI,PHI,34,15


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H
0,776135,2025-09-28,LAA,HOU,HOU,24,5
1,776135,2025-09-28,LAA,HOU,LAA,24,7
2,776136,2025-09-28,SD,AZ,AZ,25,10
3,776136,2025-09-28,SD,AZ,SD,26,10
4,776137,2025-09-28,SF,COL,COL,22,5


### Creating Copies

These copies will be used when handling missing data.

In [83]:
# Snapshot every available game_fielding_<year> table as
# game_fielding_<year>_copy before later cells rebind the original name.
for y in range(2021, 2026):
    src_name = f"game_fielding_{y}"
    dst_name = f"game_fielding_{y}_copy"

    fielding = globals().get(src_name)
    if fielding is not None:
        globals()[dst_name] = fielding.copy()
        print(f"{dst_name}: copy created from {src_name}")
    else:
        print(f"{src_name}: (not found)")

game_fielding_2021_copy: copy created from game_fielding_2021
game_fielding_2022_copy: copy created from game_fielding_2022
game_fielding_2023_copy: copy created from game_fielding_2023
game_fielding_2024_copy: copy created from game_fielding_2024
game_fielding_2025_copy: copy created from game_fielding_2025


### Adding Rolling Features

In [84]:
# Add rolling BIP features (windows of 3 and 7) to each season's fielding table.
# NOTE: the result is stored back under the same game_fielding_<year> name, so
# re-running this cell feeds the already-augmented table into the helper again.
for y in range(2021, 2026):
    name = f"game_fielding_{y}"
    fielding = globals().get(name)
    if fielding is not None:
        globals()[name] = add_rolling_bip_features(fielding, windows=(3, 7))
        print(f"{name}: shifted rolling features added")
    else:
        print(f"{name}: (not found)")

game_fielding_2021: shifted rolling features added
game_fielding_2022: shifted rolling features added
game_fielding_2023: shifted rolling features added
game_fielding_2024: shifted rolling features added
game_fielding_2025: shifted rolling features added


In [85]:
# Preview the first five rows of each season's fielding table after the
# rolling features were added.
display(
    HTML("<h4>Season 2021</h4>"), game_fielding_2021.head(5),
    HTML("<h4>Season 2022</h4>"), game_fielding_2022.head(5),
    HTML("<h4>Season 2023</h4>"), game_fielding_2023.head(5),
    HTML("<h4>Season 2024</h4>"), game_fielding_2024.head(5),
    HTML("<h4>Season 2025</h4>"), game_fielding_2025.head(5),
)


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,634640,2021-04-01,ATH,HOU,ATH,26,7,,,,,,
1,634605,2021-04-02,ATH,HOU,ATH,27,12,26.0,26.0,7.0,7.0,0.730769,0.730769
2,634629,2021-04-03,ATH,HOU,ATH,31,12,53.0,53.0,19.0,19.0,0.641509,0.641509
3,634651,2021-04-04,ATH,HOU,ATH,28,8,84.0,84.0,31.0,31.0,0.630952,0.630952
4,634600,2021-04-05,ATH,LAD,ATH,29,12,86.0,112.0,32.0,39.0,0.627907,0.651786


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,661131,2022-04-08,PHI,ATH,ATH,24,10,,,,,,
1,661130,2022-04-09,PHI,ATH,ATH,19,4,24.0,24.0,10.0,10.0,0.583333,0.583333
2,661129,2022-04-10,PHI,ATH,ATH,21,2,43.0,43.0,14.0,14.0,0.674419,0.674419
3,661915,2022-04-11,TB,ATH,ATH,22,8,64.0,64.0,16.0,16.0,0.75,0.75
4,661944,2022-04-12,TB,ATH,ATH,29,10,62.0,86.0,14.0,24.0,0.774194,0.72093


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,718769,2023-03-30,ATH,LAA,ATH,23,5,,,,,,
1,718757,2023-04-01,ATH,LAA,ATH,28,10,23.0,23.0,5.0,5.0,0.782609,0.782609
2,718734,2023-04-02,ATH,LAA,ATH,28,8,51.0,51.0,15.0,15.0,0.705882,0.705882
3,718721,2023-04-03,ATH,CLE,ATH,35,15,79.0,79.0,23.0,23.0,0.708861,0.708861
4,718707,2023-04-04,ATH,CLE,ATH,22,7,91.0,114.0,33.0,38.0,0.637363,0.666667


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,745687,2024-03-28,ATH,CLE,ATH,28,11,,,,,,
1,745682,2024-03-29,ATH,CLE,ATH,24,8,28.0,28.0,11.0,11.0,0.607143,0.607143
2,745684,2024-03-30,ATH,CLE,ATH,34,14,52.0,52.0,19.0,19.0,0.634615,0.634615
3,745683,2024-03-31,ATH,CLE,ATH,30,7,86.0,86.0,33.0,33.0,0.616279,0.616279
4,745675,2024-04-01,ATH,BOS,ATH,24,9,88.0,116.0,29.0,40.0,0.670455,0.655172


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,778547,2025-03-27,SEA,ATH,ATH,17,3,,,,,,
1,778541,2025-03-28,SEA,ATH,ATH,18,5,17.0,17.0,3.0,3.0,0.823529,0.823529
2,778521,2025-03-29,SEA,ATH,ATH,18,5,35.0,35.0,8.0,8.0,0.771429,0.771429
3,778513,2025-03-30,SEA,ATH,ATH,18,4,53.0,53.0,13.0,13.0,0.754717,0.754717
4,778501,2025-03-31,ATH,CHC,ATH,37,17,54.0,71.0,14.0,17.0,0.740741,0.760563


## Missing Data

### Aggregation by Team for Each Year

In [86]:
# Summarise each season's fielding table into fielding_summary_<year>
# (one row per fielding team, as shown in the preview below).
for y in range(2021, 2026):
    src_name = f"game_fielding_{y}"
    dst_name = f"fielding_summary_{y}"

    fielding = globals().get(src_name)
    if fielding is not None:
        globals()[dst_name] = calculate_mean_bip_out_rate(fielding)
        print(f"{dst_name}: created from {src_name}")
    else:
        print(f"{src_name}: (not found)")

fielding_summary_2021: created from game_fielding_2021
fielding_summary_2022: created from game_fielding_2022
fielding_summary_2023: created from game_fielding_2023
fielding_summary_2024: created from game_fielding_2024
fielding_summary_2025: created from game_fielding_2025


In [87]:
# Preview the first five team-level fielding summary rows per season.
display(
    HTML("<h4>Season 2021</h4>"), fielding_summary_2021.head(5),
    HTML("<h4>Season 2022</h4>"), fielding_summary_2022.head(5),
    HTML("<h4>Season 2023</h4>"), fielding_summary_2023.head(5),
    HTML("<h4>Season 2024</h4>"), fielding_summary_2024.head(5),
    HTML("<h4>Season 2025</h4>"), fielding_summary_2025.head(5),
)


Unnamed: 0,fielding_team,games,BIP,BIP_H,mean_BIP_out_rate
0,ATH,162,4005,1171,0.715006
1,ATL,161,3720,1054,0.722455
2,AZ,162,4123,1248,0.70597
3,BAL,162,4083,1260,0.699979
4,BOS,162,3779,1233,0.682138


Unnamed: 0,fielding_team,games,BIP,BIP_H,mean_BIP_out_rate
0,ATH,162,4110,1199,0.714858
1,ATL,162,3721,1076,0.717884
2,AZ,162,4055,1154,0.721631
3,BAL,162,4119,1235,0.706041
4,BOS,162,3992,1225,0.7011


Unnamed: 0,fielding_team,games,BIP,BIP_H,mean_BIP_out_rate
0,ATH,162,4013,1251,0.694346
1,ATL,162,3799,1154,0.702312
2,AZ,162,3940,1178,0.711534
3,BAL,162,3917,1157,0.715174
4,BOS,162,3922,1208,0.700314


Unnamed: 0,fielding_team,games,BIP,BIP_H,mean_BIP_out_rate
0,ATH,162,4056,1216,0.708073
1,ATL,162,3723,1126,0.705803
2,AZ,162,4117,1287,0.696629
3,BAL,162,3940,1128,0.720828
4,BOS,162,4022,1174,0.715483


Unnamed: 0,fielding_team,games,BIP,BIP_H,mean_BIP_out_rate
0,ATH,162,3982,1165,0.716586
1,ATL,162,3862,1151,0.7074
2,AZ,162,4104,1207,0.71178
3,BAL,162,3984,1216,0.700567
4,BOS,162,3981,1169,0.710181


### Filling Missing Values

In [88]:
# Fill missing rolling BIP out rates in each season's fielding table using the
# previous season's team summary. Both tables must exist, otherwise the season
# is reported and skipped.
for y in range(2022, 2026):
    df_name = f"game_fielding_{y}"
    means_name = f"fielding_summary_{y-1}"

    season_fielding = globals().get(df_name)
    prior_means = globals().get(means_name)

    if season_fielding is not None and prior_means is not None:
        # The filled table replaces the season's table under the same name.
        globals()[df_name] = fill_missing_rolling_bip_out_rate_from_prior_year(
            season_fielding, prior_means
        )
        print(f"{y}: filled missing rolling BIP out rate in {df_name} using {means_name}")
    else:
        print(f"{y}: missing {df_name} or {means_name}")

2022: filled missing rolling BIP out rate in game_fielding_2022 using fielding_summary_2021
2023: filled missing rolling BIP out rate in game_fielding_2023 using fielding_summary_2022
2024: filled missing rolling BIP out rate in game_fielding_2024 using fielding_summary_2023
2025: filled missing rolling BIP out rate in game_fielding_2025 using fielding_summary_2024


In [89]:
# Preview the first five rows of each season's fielding table after the
# prior-year fill.
display(
    HTML("<h4>Season 2022</h4>"), game_fielding_2022.head(5),
    HTML("<h4>Season 2023</h4>"), game_fielding_2023.head(5),
    HTML("<h4>Season 2024</h4>"), game_fielding_2024.head(5),
    HTML("<h4>Season 2025</h4>"), game_fielding_2025.head(5),
)


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,661131,2022-04-08,PHI,ATH,ATH,24,10,,,,,0.715006,0.715006
1,661130,2022-04-09,PHI,ATH,ATH,19,4,24.0,24.0,10.0,10.0,0.583333,0.583333
2,661129,2022-04-10,PHI,ATH,ATH,21,2,43.0,43.0,14.0,14.0,0.674419,0.674419
3,661915,2022-04-11,TB,ATH,ATH,22,8,64.0,64.0,16.0,16.0,0.75,0.75
4,661944,2022-04-12,TB,ATH,ATH,29,10,62.0,86.0,14.0,24.0,0.774194,0.72093


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,718769,2023-03-30,ATH,LAA,ATH,23,5,,,,,0.714858,0.714858
1,718757,2023-04-01,ATH,LAA,ATH,28,10,23.0,23.0,5.0,5.0,0.782609,0.782609
2,718734,2023-04-02,ATH,LAA,ATH,28,8,51.0,51.0,15.0,15.0,0.705882,0.705882
3,718721,2023-04-03,ATH,CLE,ATH,35,15,79.0,79.0,23.0,23.0,0.708861,0.708861
4,718707,2023-04-04,ATH,CLE,ATH,22,7,91.0,114.0,33.0,38.0,0.637363,0.666667


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,745687,2024-03-28,ATH,CLE,ATH,28,11,,,,,0.694346,0.694346
1,745682,2024-03-29,ATH,CLE,ATH,24,8,28.0,28.0,11.0,11.0,0.607143,0.607143
2,745684,2024-03-30,ATH,CLE,ATH,34,14,52.0,52.0,19.0,19.0,0.634615,0.634615
3,745683,2024-03-31,ATH,CLE,ATH,30,7,86.0,86.0,33.0,33.0,0.616279,0.616279
4,745675,2024-04-01,ATH,BOS,ATH,24,9,88.0,116.0,29.0,40.0,0.670455,0.655172


Unnamed: 0,game_id,game_date,home_team,away_team,fielding_team,BIP,BIP_H,roll_3G_BIP,roll_7G_BIP,roll_3G_BIP_H,roll_7G_BIP_H,roll_3G_BIP_out_rate,roll_7G_BIP_out_rate
0,778547,2025-03-27,SEA,ATH,ATH,17,3,,,,,0.708073,0.708073
1,778541,2025-03-28,SEA,ATH,ATH,18,5,17.0,17.0,3.0,3.0,0.823529,0.823529
2,778521,2025-03-29,SEA,ATH,ATH,18,5,35.0,35.0,8.0,8.0,0.771429,0.771429
3,778513,2025-03-30,SEA,ATH,ATH,18,4,53.0,53.0,13.0,13.0,0.754717,0.754717
4,778501,2025-03-31,ATH,CHC,ATH,37,17,54.0,71.0,14.0,17.0,0.740741,0.760563


### Home and Away

In [90]:
# Reshape each season's team-game fielding table into a game-level table with
# separate home and away out-rate columns (game_fielding_out_rates_<year>).
for y in range(2022, 2026):
    src_name = f"game_fielding_{y}"
    dst_name = f"game_fielding_out_rates_{y}"

    fielding = globals().get(src_name)
    if fielding is not None:
        globals()[dst_name] = make_game_level_fielding_out_rate_wide(fielding)
        print(f"{dst_name}: created (1 row per game_id)")
    else:
        print(f"{src_name}: (not found)")

game_fielding_out_rates_2022: created (1 row per game_id)
game_fielding_out_rates_2023: created (1 row per game_id)
game_fielding_out_rates_2024: created (1 row per game_id)
game_fielding_out_rates_2025: created (1 row per game_id)


In [91]:
# Preview the first five game-level home/away out-rate rows per season.
display(
    HTML("<h4>Season 2022</h4>"), game_fielding_out_rates_2022.head(5),
    HTML("<h4>Season 2023</h4>"), game_fielding_out_rates_2023.head(5),
    HTML("<h4>Season 2024</h4>"), game_fielding_out_rates_2024.head(5),
    HTML("<h4>Season 2025</h4>"), game_fielding_out_rates_2025.head(5),
)


Unnamed: 0,game_id,game_date,home_team,away_team,roll_3G_BIP_out_rate_home,roll_3G_BIP_out_rate_away,roll_7G_BIP_out_rate_home,roll_7G_BIP_out_rate_away
0,661042,2022-04-07,LAA,HOU,0.697951,0.728005,0.697951,0.728005
1,661577,2022-04-07,ATL,CIN,0.722455,0.705826,0.722455,0.705826
2,662021,2022-04-07,STL,PIT,0.729999,0.701346,0.729999,0.701346
3,662571,2022-04-07,WSH,NYM,0.715413,0.719717,0.715413,0.719717
4,662766,2022-04-07,KC,CLE,0.702066,0.721383,0.702066,0.721383


Unnamed: 0,game_id,game_date,home_team,away_team,roll_3G_BIP_out_rate_home,roll_3G_BIP_out_rate_away,roll_7G_BIP_out_rate_home,roll_7G_BIP_out_rate_away
0,718767,2023-03-30,SEA,CLE,0.721822,0.72807,0.721822,0.72807
1,718768,2023-03-30,HOU,CWS,0.735539,0.707802,0.735539,0.707802
2,718769,2023-03-30,ATH,LAA,0.714858,0.727267,0.714858,0.727267
3,718770,2023-03-30,LAD,AZ,0.748132,0.721631,0.748132,0.721631
4,718772,2023-03-30,STL,TOR,0.717239,0.708182,0.717239,0.708182


Unnamed: 0,game_id,game_date,home_team,away_team,roll_3G_BIP_out_rate_home,roll_3G_BIP_out_rate_away,roll_7G_BIP_out_rate_home,roll_7G_BIP_out_rate_away
0,745444,2024-03-20,SD,LAD,0.718257,0.730052,0.718257,0.730052
1,746175,2024-03-21,LAD,SD,0.833333,0.730769,0.833333,0.730769
2,745039,2024-03-28,TEX,CHC,0.71517,0.710967,0.71517,0.710967
3,745116,2024-03-28,TB,TOR,0.722641,0.708758,0.722641,0.708758
4,745283,2024-03-28,SEA,BOS,0.7178,0.700314,0.7178,0.700314


Unnamed: 0,game_id,game_date,home_team,away_team,roll_3G_BIP_out_rate_home,roll_3G_BIP_out_rate_away,roll_7G_BIP_out_rate_home,roll_7G_BIP_out_rate_away
0,778563,2025-03-18,CHC,LAD,0.719644,0.727353,0.719644,0.727353
1,778564,2025-03-19,CHC,LAD,0.72,0.857143,0.72,0.857143
2,778545,2025-03-27,SD,ATL,0.714376,0.705803,0.714376,0.705803
3,778546,2025-03-27,LAD,DET,0.744186,0.723558,0.744186,0.723558
4,778547,2025-03-27,SEA,ATH,0.743414,0.708073,0.743414,0.708073


### Quality Check

In [92]:
# Run the missing-value check on every available game-level out-rate table.
# (assert_no_missing prints the missing-cell total; presumably it raises when
# that total is non-zero -- confirm against its definition.)
for y in range(2022, 2026):
    df_name = f"game_fielding_out_rates_{y}"
    out_rates = globals().get(df_name)

    if out_rates is not None:
        assert_no_missing(out_rates, df_name)
    else:
        print(f"{df_name}: (not found)")

game_fielding_out_rates_2022: total missing cells = 0
game_fielding_out_rates_2023: total missing cells = 0
game_fielding_out_rates_2024: total missing cells = 0
game_fielding_out_rates_2025: total missing cells = 0


# CONTINUE HERE!

### Fielding Deltas

In [93]:
# Turn each season's home/away out-rate table into a fielding-delta table
# (game_fielding_deltas_<year>); seasons without a source table are skipped.
for y in range(2022, 2026):
    src_name = f"game_fielding_out_rates_{y}"
    dst_name = f"game_fielding_deltas_{y}"

    out_rates = globals().get(src_name)
    if out_rates is not None:
        globals()[dst_name] = make_fielding_out_rate_deltas(out_rates)
        print(f"{dst_name}: created")
    else:
        print(f"{src_name}: (not found)")

game_fielding_deltas_2022: created
game_fielding_deltas_2023: created
game_fielding_deltas_2024: created
game_fielding_deltas_2025: created


In [94]:
# Preview the first five fielding-delta rows per season.
display(
    HTML("<h4>Season 2022</h4>"), game_fielding_deltas_2022.head(5),
    HTML("<h4>Season 2023</h4>"), game_fielding_deltas_2023.head(5),
    HTML("<h4>Season 2024</h4>"), game_fielding_deltas_2024.head(5),
    HTML("<h4>Season 2025</h4>"), game_fielding_deltas_2025.head(5),
)


Unnamed: 0,game_id,game_date,home_team,away_team,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G
0,661042,2022-04-07,LAA,HOU,-0.030054,-0.030054
1,661577,2022-04-07,ATL,CIN,0.016629,0.016629
2,662021,2022-04-07,STL,PIT,0.028652,0.028652
3,662571,2022-04-07,WSH,NYM,-0.004304,-0.004304
4,662766,2022-04-07,KC,CLE,-0.019316,-0.019316


Unnamed: 0,game_id,game_date,home_team,away_team,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G
0,718767,2023-03-30,SEA,CLE,-0.006248,-0.006248
1,718768,2023-03-30,HOU,CWS,0.027737,0.027737
2,718769,2023-03-30,ATH,LAA,-0.012409,-0.012409
3,718770,2023-03-30,LAD,AZ,0.026501,0.026501
4,718772,2023-03-30,STL,TOR,0.009057,0.009057


Unnamed: 0,game_id,game_date,home_team,away_team,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G
0,745444,2024-03-20,SD,LAD,-0.011795,-0.011795
1,746175,2024-03-21,LAD,SD,0.102564,0.102564
2,745039,2024-03-28,TEX,CHC,0.004203,0.004203
3,745116,2024-03-28,TB,TOR,0.013884,0.013884
4,745283,2024-03-28,SEA,BOS,0.017486,0.017486


Unnamed: 0,game_id,game_date,home_team,away_team,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G
0,778563,2025-03-18,CHC,LAD,-0.007709,-0.007709
1,778564,2025-03-19,CHC,LAD,-0.137143,-0.137143
2,778545,2025-03-27,SD,ATL,0.008573,0.008573
3,778546,2025-03-27,LAD,DET,0.020628,0.020628
4,778547,2025-03-27,SEA,ATH,0.035341,0.035341


### Game Number Validation

**TODO**: Make a function that works with all of the other dataframes and, rather than returning the dataframe, returns True or False so that the result can be checked with an assert statement.

In [95]:
# Validation: count how many distinct games each team appears in (as home or
# away side) per season of the fielding-delta tables.
fielding_deltas = {
    2022: game_fielding_deltas_2022,
    2023: game_fielding_deltas_2023,
    2024: game_fielding_deltas_2024,
    2025: game_fielding_deltas_2025,
}

# One (game_id, team, season) slice per season and side; the home and away
# columns are both renamed to a common "team" column.
rows = [
    season_deltas[["game_id", side]]
    .rename(columns={side: "team"})
    .assign(season=season_year)
    for season_year, season_deltas in fielding_deltas.items()
    for side in ("home_team", "away_team")
]

# Distinct games per (team, season), pivoted to one column per season.
# nunique() makes any duplicated (game_id, team) rows irrelevant.
fielding_games_per_team = (
    pd.concat(rows, ignore_index=True)
      .groupby(["team", "season"])["game_id"]
      .nunique()
      .unstack("season")
      .sort_index()
)

fielding_games_per_team

season,2022,2023,2024,2025
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ATH,162,162,162,162
ATL,162,162,162,162
AZ,162,162,162,162
BAL,162,162,162,162
BOS,162,162,162,162
CHC,162,162,162,162
CIN,162,162,162,162
CLE,162,162,161,162
COL,162,162,162,162
CWS,162,162,162,162


## Combining Pitching, Batting, and Fielding Data

Now, we can finally combine pitching, batting, and fielding data. 

In [96]:
# Build one modeling table per season (game_features_<year>) by joining the
# pitching, batting and fielding delta tables on their shared game keys.
# A missing input table raises KeyError, so an incomplete pipeline fails loudly.
for y in range(2022, 2026):
    # pitching + batting
    df = combine_pitching_batting_deltas(
        pitching_deltas=globals()[f"game_pitching_deltas_{y}"],
        batting_deltas=globals()[f"game_batting_deltas_{y}"],
        how="inner",
    )

    # normalize team cols from that merge
    df = df.drop(columns=["home_team_y", "away_team_y"], errors="ignore").rename(
        columns={"home_team_x": "home_team", "away_team_x": "away_team"}
    )

    # add fielding deltas
    fld = globals()[f"game_fielding_deltas_{y}"]

    # validate="one_to_one" makes pandas raise if the key columns are
    # duplicated on either side of the join.
    df = df.merge(
        fld,
        on=["game_id", "game_date", "home_team", "away_team"],
        how="inner",
        validate="one_to_one",
    )

    # Inner joins discard unmatched games without raising, so surface any
    # input table whose row count differs from the merged result.
    input_rows = {
        "pitching": len(globals()[f"game_pitching_deltas_{y}"]),
        "batting": len(globals()[f"game_batting_deltas_{y}"]),
        "fielding": len(fld),
    }
    mismatched = {src: n for src, n in input_rows.items() if n != len(df)}
    if mismatched:
        print(
            f"game_features_{y}: WARNING merged {len(df):,} rows but input row "
            f"counts differ: {mismatched}"
        )

    globals()[f"game_features_{y}"] = df
    print(f"game_features_{y}: {df.shape}")

game_features_2022: (2430, 22)
game_features_2023: (2430, 22)
game_features_2024: (2429, 22)
game_features_2025: (2430, 22)


In [97]:
# Preview the first five rows of each season's combined feature table.
display(
    HTML("<h4>Season 2022</h4>"), game_features_2022.head(5),
    HTML("<h4>Season 2023</h4>"), game_features_2023.head(5),
    HTML("<h4>Season 2024</h4>"), game_features_2024.head(5),
    HTML("<h4>Season 2025</h4>"), game_features_2025.head(5),
)

Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,...,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G
0,661042,2022-04-07,LAA,HOU,"Ohtani, Shohei","Valdez, Framber",-0.501346,-0.173269,2.3625,0.228462,...,-0.173269,2.3625,0.228462,-0.010856,-0.02716,-0.013193,-0.02716,-0.013193,-0.030054,-0.030054
1,661577,2022-04-07,ATL,CIN,"Fried, Max","Mahle, Tyler",-0.500258,-0.156226,-1.896095,-0.385842,...,-0.156226,-1.896095,-0.385842,-0.485123,-0.011229,0.006274,-0.011229,0.006274,0.016629,0.016629
2,662021,2022-04-07,STL,PIT,"Wainwright, Adam","Brubaker, JT",-1.496179,-0.272351,-1.838997,-1.136493,...,-0.272351,-1.838997,-1.136493,-0.449753,0.004731,0.038504,0.004731,0.038504,0.028652,0.028652
3,662571,2022-04-07,WSH,NYM,"Corbin, Patrick","Megill, Tylor",0.740873,0.193245,-2.388419,0.048493,...,0.193245,-2.388419,0.048493,0.811662,0.021413,0.008,0.021413,0.008,-0.004304,-0.004304
4,662766,2022-04-07,KC,CLE,"Greinke, Zack","Bieber, Shane",1.678132,-0.063212,-6.238713,0.528978,...,-0.063212,-6.238713,0.528978,0.139373,0.005893,-0.01945,0.005893,-0.01945,-0.019316,-0.019316


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,...,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G
0,718767,2023-03-30,SEA,CLE,"Castillo, Luis","Bieber, Shane",0.202156,0.087045,1.087248,-0.032947,...,0.087045,1.087248,-0.032947,0.259986,-0.002744,0.030771,-0.002744,0.030771,-0.006248,-0.006248
1,718768,2023-03-30,HOU,CWS,"Valdez, Framber","Cease, Dylan",-0.038578,0.082587,-2.501297,-0.296834,...,0.082587,-2.501297,-0.296834,-0.554982,0.009051,0.042771,0.009051,0.042771,0.027737,0.027737
2,718769,2023-03-30,ATH,LAA,"Muller, Kyle","Ohtani, Shohei",2.71371,0.587159,-3.613679,0.622519,...,0.587159,-3.613679,0.622519,-0.15673,-0.016848,-0.026978,-0.016848,-0.026978,-0.012409,-0.012409
3,718770,2023-03-30,LAD,AZ,"Urías, Julio","Gallen, Zac",0.670662,0.005336,-0.909239,0.448819,...,0.005336,-0.909239,0.448819,-1.159878,0.029503,0.029917,0.029503,0.029917,0.026501,0.026501
4,718772,2023-03-30,STL,TOR,"Mikolas, Miles","Manoah, Alek",0.545019,-0.004475,-1.346344,0.401137,...,-0.004475,-1.346344,0.401137,-0.110717,-0.003963,-0.000672,-0.003963,-0.000672,0.009057,0.009057


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,...,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G
0,745444,2024-03-20,SD,LAD,"Darvish, Yu","Glasnow, Tyler",1.100972,0.27806,-2.70335,0.230955,...,0.27806,-2.70335,0.230955,0.495107,-0.010627,-0.030386,-0.010627,-0.030386,-0.011795,-0.011795
1,746175,2024-03-21,LAD,SD,"Yamamoto, Yoshinobu","Musgrove, Joe",0.910845,0.121779,-0.486247,0.406206,...,0.121779,-0.486247,0.406206,-3.9375,0.142484,0.0,0.142484,0.0,0.102564,0.102564
2,745039,2024-03-28,TEX,CHC,"Eovaldi, Nathan","Steele, Justin",0.871255,-0.019033,-0.807548,0.220974,...,-0.019033,-0.807548,0.220974,0.391994,0.007217,0.023539,0.007217,0.023539,0.004203,0.004203
3,745116,2024-03-28,TB,TOR,"Eflin, Zach","Berríos, José",-1.009147,-0.203679,0.650943,-0.235337,...,-0.203679,0.650943,-0.235337,0.117121,0.003449,0.025706,0.003449,0.025706,0.013884,0.013884
4,745283,2024-03-28,SEA,BOS,"Castillo, Luis","Bello, Brayan",-0.743401,-0.266235,2.410068,-0.10726,...,-0.266235,2.410068,-0.10726,-0.421465,-0.001748,0.004338,-0.001748,0.004338,0.017486,0.017486


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,...,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G
0,778563,2025-03-18,CHC,LAD,"Imanaga, Shota","Yamamoto, Yoshinobu",1.123302,-0.073846,-1.38462,0.721391,...,-0.073846,-1.38462,0.721391,-0.116271,-0.019315,-0.038771,-0.019315,-0.038771,-0.007709,-0.007709
1,778564,2025-03-19,CHC,LAD,"Steele, Justin","Sasaki, Roki",-0.933196,-0.189999,0.617142,-0.422758,...,-0.189999,0.617142,-0.422758,1.35,-0.200893,-0.02549,-0.200893,-0.02549,-0.137143,-0.137143
2,778545,2025-03-27,SD,ATL,"King, Michael","Sale, Chris",1.198333,0.157857,-1.071429,0.394286,...,0.157857,-1.071429,0.394286,0.187259,0.012415,-0.013947,0.012415,-0.013947,0.008573,0.008573
3,778546,2025-03-27,LAD,DET,"Snell, Blake","Skubal, Tarik",-0.070999,0.092437,1.907665,-0.18377,...,0.092437,1.907665,-0.18377,-2.581356,0.076655,0.25373,0.076655,0.25373,0.020628,0.020628
4,778547,2025-03-27,SEA,ATH,"Gilbert, Logan","Severino, Luis",-0.960329,-0.41391,1.439538,-0.029619,...,-0.41391,1.439538,-0.029619,0.093181,0.010867,-0.005802,0.010867,-0.005802,0.035341,0.035341


**TODO**: Turn this check into a reusable function that works with all of the other dataframes. Rather than returning the dataframe, it should return `True` or `False` so the result can be wrapped in an `assert` statement.

In [98]:
# Season -> game-level feature frame, so the checks below can loop over seasons.
game_features = {
    2022: game_features_2022,
    2023: game_features_2023,
    2024: game_features_2024,
    2025: game_features_2025,
}

# 1) sanity: should be 1 row per game_id
for year, df in game_features.items():
    print(f"{year}: rows={len(df):,} | unique game_id={df['game_id'].nunique():,}")
    # Enforce the invariant instead of relying on someone eyeballing the printout;
    # a duplicated game_id here would otherwise flow silently into later steps.
    assert len(df) == df["game_id"].nunique(), (
        f"{year}: expected one row per game_id, "
        f"got {len(df):,} rows for {df['game_id'].nunique():,} unique ids"
    )

# 2) counts per team, seasons as columns
rows = []
for year, df in game_features.items():
    games = df[["game_id", "home_team", "away_team"]].drop_duplicates()

    # Stack the home and away sides into a single (game_id, team) frame so that
    # every game is counted once for each of the two clubs involved.
    long = pd.concat(
        [
            games[["game_id", "home_team"]].rename(columns={"home_team": "team"}),
            games[["game_id", "away_team"]].rename(columns={"away_team": "team"}),
        ],
        ignore_index=True,
    ).drop_duplicates()

    long["season"] = year
    rows.append(long)

# Pivot to one row per team and one column per season; each cell is the number
# of distinct game_ids that team appears in for that season.
game_features_games_per_team = (
    pd.concat(rows, ignore_index=True)
      .groupby(["team", "season"])["game_id"]
      .nunique()
      .unstack("season")
      .sort_index()
)

game_features_games_per_team


2022: rows=2,430 | unique game_id=2,430
2023: rows=2,430 | unique game_id=2,430
2024: rows=2,429 | unique game_id=2,429
2025: rows=2,430 | unique game_id=2,430


season,2022,2023,2024,2025
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ATH,162,162,162,162
ATL,162,162,162,162
AZ,162,162,162,162
BAL,162,162,162,162
BOS,162,162,162,162
CHC,162,162,162,162
CIN,162,162,162,162
CLE,162,162,161,162
COL,162,162,162,162
CWS,162,162,162,162


### Appending Game Scores

In [99]:
def attach_game_outcomes(features, outcomes, outcome_cols=("home_win", "run_diff")):
    """Return `features` with the outcome columns joined on from `outcomes`.

    Parameters
    ----------
    features : DataFrame with a `game_id` column (one row per game).
    outcomes : DataFrame with `game_id` plus every column in `outcome_cols`.
    outcome_cols : names of the outcome columns to attach.

    Returns
    -------
    A new DataFrame: `features` inner-joined to the selected outcome columns on
    `game_id`. Feature rows without a matching outcome are not kept.

    Raises
    ------
    pandas.errors.MergeError if `game_id` is not unique on both sides
    (enforced by `validate="one_to_one"`).
    """
    outcome_cols = list(outcome_cols)
    # If this cell is re-run, `features` already carries the outcome columns and a
    # plain merge would emit `home_win_x` / `home_win_y`. Drop stale copies first
    # so the operation is idempotent.
    base = features.drop(columns=outcome_cols, errors="ignore")
    outs_small = outcomes[["game_id", *outcome_cols]]
    return base.merge(
        outs_small,
        on="game_id",
        how="inner",          # change to "left" if you want to keep all feature rows
        validate="one_to_one"
    )


# Attach home_win / run_diff to each season's feature frame, rebinding the
# notebook-level `game_features_<year>` variable to the merged result.
# NOTE: the `game_features` dict built in the earlier sanity-check cell still
# points at the pre-merge frames after this cell runs.
for y in range(2022, 2026):
    feat_name = f"game_features_{y}"
    out_name  = f"game_outcomes_{y}"

    feat = globals().get(feat_name)
    outs = globals().get(out_name)

    if feat is None or outs is None:
        print(f"{y}: missing {feat_name} or {out_name}")
        continue

    merged = attach_game_outcomes(feat, outs)

    # The inner join can drop games silently; surface it when it happens.
    if len(merged) < len(feat):
        print(f"{feat_name}: {len(feat) - len(merged)} feature rows had no outcome and were dropped")

    globals()[feat_name] = merged
    print(f"{feat_name}: {merged.shape} (outcomes added)")

game_features_2022: (2430, 24) (outcomes added)
game_features_2023: (2430, 24) (outcomes added)
game_features_2024: (2429, 24) (outcomes added)
game_features_2025: (2430, 24) (outcomes added)


In [100]:
# Preview the first five rows of each season's feature frame again, now that the
# outcome columns have been merged in, each under its own "Season YYYY" heading.
for season, season_features in (
    (2022, game_features_2022),
    (2023, game_features_2023),
    (2024, game_features_2024),
    (2025, game_features_2025),
):
    display(HTML(f"<h4>Season {season}</h4>"))
    display(season_features.head(5))

Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,...,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G,home_win,run_diff
0,661042,2022-04-07,LAA,HOU,"Ohtani, Shohei","Valdez, Framber",-0.501346,-0.173269,2.3625,0.228462,...,0.228462,-0.010856,-0.02716,-0.013193,-0.02716,-0.013193,-0.030054,-0.030054,0,-2
1,661577,2022-04-07,ATL,CIN,"Fried, Max","Mahle, Tyler",-0.500258,-0.156226,-1.896095,-0.385842,...,-0.385842,-0.485123,-0.011229,0.006274,-0.011229,0.006274,0.016629,0.016629,0,-3
2,662021,2022-04-07,STL,PIT,"Wainwright, Adam","Brubaker, JT",-1.496179,-0.272351,-1.838997,-1.136493,...,-1.136493,-0.449753,0.004731,0.038504,0.004731,0.038504,0.028652,0.028652,1,9
3,662571,2022-04-07,WSH,NYM,"Corbin, Patrick","Megill, Tylor",0.740873,0.193245,-2.388419,0.048493,...,0.048493,0.811662,0.021413,0.008,0.021413,0.008,-0.004304,-0.004304,0,-4
4,662766,2022-04-07,KC,CLE,"Greinke, Zack","Bieber, Shane",1.678132,-0.063212,-6.238713,0.528978,...,0.528978,0.139373,0.005893,-0.01945,0.005893,-0.01945,-0.019316,-0.019316,1,2


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,...,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G,home_win,run_diff
0,718767,2023-03-30,SEA,CLE,"Castillo, Luis","Bieber, Shane",0.202156,0.087045,1.087248,-0.032947,...,-0.032947,0.259986,-0.002744,0.030771,-0.002744,0.030771,-0.006248,-0.006248,1,3
1,718768,2023-03-30,HOU,CWS,"Valdez, Framber","Cease, Dylan",-0.038578,0.082587,-2.501297,-0.296834,...,-0.296834,-0.554982,0.009051,0.042771,0.009051,0.042771,0.027737,0.027737,0,-1
2,718769,2023-03-30,ATH,LAA,"Muller, Kyle","Ohtani, Shohei",2.71371,0.587159,-3.613679,0.622519,...,0.622519,-0.15673,-0.016848,-0.026978,-0.016848,-0.026978,-0.012409,-0.012409,1,1
3,718770,2023-03-30,LAD,AZ,"Urías, Julio","Gallen, Zac",0.670662,0.005336,-0.909239,0.448819,...,0.448819,-1.159878,0.029503,0.029917,0.029503,0.029917,0.026501,0.026501,1,6
4,718772,2023-03-30,STL,TOR,"Mikolas, Miles","Manoah, Alek",0.545019,-0.004475,-1.346344,0.401137,...,0.401137,-0.110717,-0.003963,-0.000672,-0.003963,-0.000672,0.009057,0.009057,0,-1


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,...,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G,home_win,run_diff
0,745444,2024-03-20,SD,LAD,"Darvish, Yu","Glasnow, Tyler",1.100972,0.27806,-2.70335,0.230955,...,0.230955,0.495107,-0.010627,-0.030386,-0.010627,-0.030386,-0.011795,-0.011795,0,-3
1,746175,2024-03-21,LAD,SD,"Yamamoto, Yoshinobu","Musgrove, Joe",0.910845,0.121779,-0.486247,0.406206,...,0.406206,-3.9375,0.142484,0.0,0.142484,0.0,0.102564,0.102564,0,-4
2,745039,2024-03-28,TEX,CHC,"Eovaldi, Nathan","Steele, Justin",0.871255,-0.019033,-0.807548,0.220974,...,0.220974,0.391994,0.007217,0.023539,0.007217,0.023539,0.004203,0.004203,1,1
3,745116,2024-03-28,TB,TOR,"Eflin, Zach","Berríos, José",-1.009147,-0.203679,0.650943,-0.235337,...,-0.235337,0.117121,0.003449,0.025706,0.003449,0.025706,0.013884,0.013884,0,-6
4,745283,2024-03-28,SEA,BOS,"Castillo, Luis","Bello, Brayan",-0.743401,-0.266235,2.410068,-0.10726,...,-0.10726,-0.421465,-0.001748,0.004338,-0.001748,0.004338,0.017486,0.017486,0,-2


Unnamed: 0,game_id,game_date,home_team,away_team,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,...,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO,ΔBIP_out_rate_3G,ΔBIP_out_rate_7G,home_win,run_diff
0,778563,2025-03-18,CHC,LAD,"Imanaga, Shota","Yamamoto, Yoshinobu",1.123302,-0.073846,-1.38462,0.721391,...,0.721391,-0.116271,-0.019315,-0.038771,-0.019315,-0.038771,-0.007709,-0.007709,0,-3
1,778564,2025-03-19,CHC,LAD,"Steele, Justin","Sasaki, Roki",-0.933196,-0.189999,0.617142,-0.422758,...,-0.422758,1.35,-0.200893,-0.02549,-0.200893,-0.02549,-0.137143,-0.137143,0,-3
2,778545,2025-03-27,SD,ATL,"King, Michael","Sale, Chris",1.198333,0.157857,-1.071429,0.394286,...,0.394286,0.187259,0.012415,-0.013947,0.012415,-0.013947,0.008573,0.008573,1,3
3,778546,2025-03-27,LAD,DET,"Snell, Blake","Skubal, Tarik",-0.070999,0.092437,1.907665,-0.18377,...,-0.18377,-2.581356,0.076655,0.25373,0.076655,0.25373,0.020628,0.020628,1,1
4,778547,2025-03-27,SEA,ATH,"Gilbert, Logan","Severino, Luis",-0.960329,-0.41391,1.439538,-0.029619,...,-0.029619,0.093181,0.010867,-0.005802,0.010867,-0.005802,0.035341,0.035341,1,2
