# Game Preprocessing

This notebook prepares and cleans 2021 through 2025 MLB game data for modeling.


In [1]:
from functools import lru_cache
import pandas as pd
import numpy as np
from pathlib import Path 
from IPython.display import display, HTML
import sys

In [2]:
# Add repo_root/src to PYTHONPATH
repo_root = Path.cwd()  # if your notebook is in the repo root
src_path = repo_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))


In [3]:
from preprocessing.preprocessing_common import (
    add_game_id,
    merge_game_number_and_pitcher,
    trim_game_id_inplace,
    append_game_number_to_game_id,
    PA_ENDING_EVENTS,
    filter_plate_appearances,
    combine_pitching_batting_deltas,
)

In [4]:
from preprocessing.pitching_preprocessing import (
    add_starter_indicator_pitchlevel,
    add_pitching_indicators,
    split_starter_bullpen,
    aggregate_pitching_game_lines,
    add_rolling_pitching_counts,
    add_rate_metrics_from_rolled_counts,
    combine_game_level_pitching_rolling_rates,
    make_pitching_delta_df,
)


In [5]:
from preprocessing.batting_preprocessing import (
    add_batting_indicators,
    split_batting_home_away,
    aggregate_team_game_batting,
    add_time_rolling_batting_sums,
    add_rolling_obp_iso,
    add_rolling_obp_iso_batch,
    combine_home_away_batting_rolls,
    make_batting_delta_df,
)


## Reading in Data

Below, I read in season-level data from **2021–2025** using a cached helper function to avoid repeated disk reads and improve performance.

In [6]:
data_dir = Path("data/raw_season_data")
season_files = {y: data_dir / f"season_{y}.csv" for y in range(2021, 2026)}

read_kwargs = {
    # "usecols": [...],                 # select only needed columns
    # "parse_dates": [...],             # e.g., ["game_date"]
    # "dtype": {"batter_id": "int32"},  # downcast numerics where safe
    "engine": "pyarrow",                # faster & lower memory if available
    # "dtype_backend": "pyarrow",       # pandas 2.1+: keeps Arrow dtypes
}

@lru_cache(maxsize=None)
def load_season(year: int) -> pd.DataFrame:
    df = pd.read_csv(season_files[year], **read_kwargs)
    return df

In [7]:
season_2021 = load_season(2021)
season_2022 = load_season(2022)
season_2023 = load_season(2023)
season_2024 = load_season(2024)
season_2025 = load_season(2025)

display(HTML("<h4>Season 2021</h4>")); display(season_2021.head(5))
display(HTML("<h4>Season 2022</h4>")); display(season_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(season_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(season_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(season_2025.head(5))

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,FF,2021-10-03,92.3,1.4,6.8,"Smith, Will",596019,519293,field_out,hit_into_play,...,,1.28,0.69,-0.69,47.4,,,,,
1,SL,2021-10-03,80.6,1.6,6.64,"Smith, Will",596019,519293,,foul,...,,2.99,-0.77,0.77,44.3,,,,,
2,CU,2021-10-03,75.5,1.46,6.88,"Smith, Will",596019,519293,,foul,...,,4.52,-0.65,0.65,51.7,,,,,
3,CU,2021-10-03,75.0,1.53,6.83,"Smith, Will",596019,519293,,ball,...,,4.74,-0.69,0.69,49.5,,,,,
4,FF,2021-10-03,91.2,1.49,6.66,"Smith, Will",607043,519293,field_out,hit_into_play,...,,1.49,0.63,0.63,44.0,,,,,


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,CH,2022-10-05,80.8,-0.76,6.61,"Baker, Bryan",624415,641329,field_out,hit_into_play,...,0.0,2.68,1.34,-1.34,59.9,,,,,
1,FF,2022-10-05,97.7,-0.58,6.6,"Baker, Bryan",643376,641329,strikeout,swinging_strike,...,0.0,0.81,0.17,0.17,53.6,,,,,
2,CH,2022-10-05,84.9,-0.55,6.58,"Baker, Bryan",643376,641329,,ball,...,0.0,2.34,1.22,1.22,58.4,,,,,
3,FF,2022-10-05,97.2,-0.42,6.6,"Baker, Bryan",643376,641329,,swinging_strike,...,0.0,0.68,0.13,0.13,57.2,,,,,
4,SL,2022-10-05,86.2,-0.55,6.64,"Baker, Bryan",643376,641329,,called_strike,...,0.0,3.04,-0.63,-0.63,58.8,,,,,


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,CH,2023-10-01,89.0,-2.8,5.59,"Robertson, Nick",677008,687798,field_out,hit_into_play,...,,2.55,1.53,-1.53,31.7,1.676715,-1.896554,41.830979,30.714944,26.41202
1,FF,2023-10-01,96.9,-2.4,5.9,"Robertson, Nick",677008,687798,,foul,...,,1.09,0.76,-0.76,47.4,8.715532,3.692542,40.551342,33.656454,26.020583
2,CH,2023-10-01,90.0,-2.93,5.56,"Robertson, Nick",677008,687798,,ball,...,,2.47,1.65,-1.65,30.3,,,,,
3,ST,2023-10-01,82.2,-3.09,5.55,"Robertson, Nick",677008,687798,,ball,...,,3.14,-1.43,1.43,28.9,,,,,
4,CH,2023-10-01,89.2,-2.87,5.58,"Robertson, Nick",677008,687798,,swinging_strike,...,,2.57,1.49,-1.49,34.3,20.169759,-7.584644,37.675911,44.236969,36.187039


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,FF,2024-09-30,97.4,-2.1,4.88,"Díaz, Edwin",518595,621242,field_out,hit_into_play,...,1.0,1.4,0.96,0.96,17.6,6.149605,12.090516,22.1604,45.805662,22.048373
1,SL,2024-09-30,90.7,-2.14,5.06,"Díaz, Edwin",518595,621242,,ball,...,1.0,2.14,-0.2,-0.2,23.1,,,,,
2,SL,2024-09-30,91.1,-2.07,5.14,"Díaz, Edwin",518595,621242,,swinging_strike,...,1.0,2.37,-0.12,-0.12,22.4,23.541699,-27.093819,34.778701,45.227965,45.368412
3,SL,2024-09-30,91.3,-2.05,5.07,"Díaz, Edwin",518595,621242,,ball,...,1.0,2.09,-0.21,-0.21,22.4,,,,,
4,SL,2024-09-30,89.1,-2.13,5.15,"Díaz, Edwin",518595,621242,,swinging_strike,...,1.0,2.2,-0.17,-0.17,20.2,23.112048,-30.629825,33.038132,53.011806,51.686541


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,attack_angle,attack_direction,swing_path_tilt,intercept_ball_minus_batter_pos_x_inches,intercept_ball_minus_batter_pos_y_inches
0,FF,2025-09-28,95.7,-2.15,5.21,"Weissert, Greg",678009,669711,field_out,hit_into_play,...,2.0,1.56,0.71,-0.71,20.9,5.991833,-1.319512,28.782516,41.559201,30.599805
1,FF,2025-09-28,95.1,-1.91,5.1,"Weissert, Greg",668670,669711,strikeout,called_strike,...,9.0,1.59,0.93,0.93,20.5,,,,,
2,FF,2025-09-28,95.4,-1.99,5.22,"Weissert, Greg",668670,669711,,foul,...,9.0,1.36,0.85,0.85,22.9,2.871131,31.805044,22.266527,37.478847,15.582717
3,SL,2025-09-28,84.8,-2.33,4.72,"Weissert, Greg",668670,669711,,swinging_strike,...,9.0,2.55,-0.32,-0.32,12.3,13.78541,4.08139,32.414181,38.011685,27.083341
4,SL,2025-09-28,85.3,-2.26,4.85,"Weissert, Greg",668670,669711,,called_strike,...,9.0,2.71,-0.52,-0.52,15.8,,,,,


## Examining Data

Below, I examine dataset dimensions, column consistency across seasons, and the columns selected from a representative dataframe.

### Dimensions

In [8]:
for y in range(2021, 2026):
    r, c = globals()[f"season_{y}"].shape
    print(f"season_{y}: {r:,} rows × {c} cols")

season_2021: 712,320 rows × 118 cols
season_2022: 710,210 rows × 118 cols
season_2023: 720,684 rows × 118 cols
season_2024: 732,481 rows × 118 cols
season_2025: 742,080 rows × 118 cols


### Column Consistency Check Across Seasons

- Defines the years to check (2021–2025).
- Uses `season_2025` as the reference column schema.
- Compares each `season_YYYY` dataset to the reference.
- Prints **OK** if column names *and order* match exactly, otherwise **DIFF**.
- Tracks whether all datasets match.
- Outputs a final `True/False` summary indicating full column consistency.

**Note:** This is a strict check — column order must also match.


In [9]:
years = range(2021, 2026)
ref = globals()["season_2025"].columns  # use 2025 as reference

print(f"[REFERENCE] season_2025 ({len(ref)} columns)\n")

all_match = True
for y in years:
    cols = globals()[f"season_{y}"].columns
    ok = cols.equals(ref)
    print(f"season_{y}: {'OK' if ok else 'DIFF'}")
    all_match &= ok

print("\nALL MATCH (names + order):", all_match)

[REFERENCE] season_2025 (118 columns)

season_2021: OK
season_2022: OK
season_2023: OK
season_2024: OK
season_2025: OK

ALL MATCH (names + order): True


### Printing Columns 

Since all columns match, I will print the `season_2025` columns.

In [10]:
season_2025.columns.tolist()

['pitch_type',
 'game_date',
 'release_speed',
 'release_pos_x',
 'release_pos_z',
 'player_name',
 'batter',
 'pitcher',
 'events',
 'description',
 'spin_dir',
 'spin_rate_deprecated',
 'break_angle_deprecated',
 'break_length_deprecated',
 'zone',
 'des',
 'game_type',
 'stand',
 'p_throws',
 'home_team',
 'away_team',
 'type',
 'hit_location',
 'bb_type',
 'balls',
 'strikes',
 'game_year',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'on_3b',
 'on_2b',
 'on_1b',
 'outs_when_up',
 'inning',
 'inning_topbot',
 'hc_x',
 'hc_y',
 'tfs_deprecated',
 'tfs_zulu_deprecated',
 'umpire',
 'sv_id',
 'vx0',
 'vy0',
 'vz0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'hit_distance_sc',
 'launch_speed',
 'launch_angle',
 'effective_speed',
 'release_spin_rate',
 'release_extension',
 'game_pk',
 'fielder_2',
 'fielder_3',
 'fielder_4',
 'fielder_5',
 'fielder_6',
 'fielder_7',
 'fielder_8',
 'fielder_9',
 'release_pos_y',
 'estimated_ba_using_speedangle',
 'estimated_woba_using_speedangle',
 'w

## Regular Season Games

Below, I filter each season dataset (2021–2025) to retain **regular season** games only (`game_type == "R"`).

In [11]:
for y in range(2021, 2026):
    name = f"season_{y}"
    globals()[name] = globals()[name].loc[globals()[name]["game_type"] == "R"].copy()

## Dropping Columns

Below, I retain only the columns needed for calculating our batting and pitching features.


In [12]:
cols_to_keep = [
    "game_date",
    "home_team",
    "away_team",
    "inning",
    "inning_topbot",
    "pitch_number",
    "outs_when_up",
    "home_score",
    "away_score",
    "events",
    "description",
    "batter",
    "pitcher",
    "player_name"
]

for y in range(2021, 2026):
    name = f"season_{y}"
    df = globals().get(name)
    if df is None:
        print(f"{name}: (not loaded)")
        continue

    # Keep only relevant columns (skip missing ones safely)
    available = [c for c in cols_to_keep if c in df.columns]
    globals()[name] = df[available]

    print(f"{name}: kept {len(available)} columns")

season_2021: kept 14 columns
season_2022: kept 14 columns
season_2023: kept 14 columns
season_2024: kept 14 columns
season_2025: kept 14 columns


In [13]:
display(HTML("<h4>Season 2021</h4>")); display(season_2021.head(5))
display(HTML("<h4>Season 2022</h4>")); display(season_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(season_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(season_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(season_2025.head(5))

Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter,pitcher,player_name
0,2021-10-03,ATL,NYM,9,Top,4,2,5,0,field_out,hit_into_play,596019,519293,"Smith, Will"
1,2021-10-03,ATL,NYM,9,Top,3,2,5,0,,foul,596019,519293,"Smith, Will"
2,2021-10-03,ATL,NYM,9,Top,2,2,5,0,,foul,596019,519293,"Smith, Will"
3,2021-10-03,ATL,NYM,9,Top,1,2,5,0,,ball,596019,519293,"Smith, Will"
4,2021-10-03,ATL,NYM,9,Top,2,1,5,0,field_out,hit_into_play,607043,519293,"Smith, Will"


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter,pitcher,player_name
0,2022-10-05,BAL,TOR,9,Top,1,2,5,4,field_out,hit_into_play,624415,641329,"Baker, Bryan"
1,2022-10-05,BAL,TOR,9,Top,5,1,5,4,strikeout,swinging_strike,643376,641329,"Baker, Bryan"
2,2022-10-05,BAL,TOR,9,Top,4,1,5,4,,ball,643376,641329,"Baker, Bryan"
3,2022-10-05,BAL,TOR,9,Top,3,1,5,4,,swinging_strike,643376,641329,"Baker, Bryan"
4,2022-10-05,BAL,TOR,9,Top,2,1,5,4,,called_strike,643376,641329,"Baker, Bryan"


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter,pitcher,player_name
0,2023-10-01,BAL,BOS,9,Bot,6,2,1,6,field_out,hit_into_play,677008,687798,"Robertson, Nick"
1,2023-10-01,BAL,BOS,9,Bot,5,2,1,6,,foul,677008,687798,"Robertson, Nick"
2,2023-10-01,BAL,BOS,9,Bot,4,2,1,6,,ball,677008,687798,"Robertson, Nick"
3,2023-10-01,BAL,BOS,9,Bot,3,2,1,6,,ball,677008,687798,"Robertson, Nick"
4,2023-10-01,BAL,BOS,9,Bot,2,2,1,6,,swinging_strike,677008,687798,"Robertson, Nick"


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter,pitcher,player_name
0,2024-09-30,ATL,NYM,9,Bot,5,2,7,8,field_out,hit_into_play,518595,621242,"Díaz, Edwin"
1,2024-09-30,ATL,NYM,9,Bot,4,2,7,8,,ball,518595,621242,"Díaz, Edwin"
2,2024-09-30,ATL,NYM,9,Bot,3,2,7,8,,swinging_strike,518595,621242,"Díaz, Edwin"
3,2024-09-30,ATL,NYM,9,Bot,2,2,7,8,,ball,518595,621242,"Díaz, Edwin"
4,2024-09-30,ATL,NYM,9,Bot,1,2,7,8,,swinging_strike,518595,621242,"Díaz, Edwin"


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter,pitcher,player_name
0,2025-09-28,BOS,DET,9,Top,1,2,4,3,field_out,hit_into_play,678009,669711,"Weissert, Greg"
1,2025-09-28,BOS,DET,9,Top,4,1,4,3,strikeout,called_strike,668670,669711,"Weissert, Greg"
2,2025-09-28,BOS,DET,9,Top,3,1,4,3,,foul,668670,669711,"Weissert, Greg"
3,2025-09-28,BOS,DET,9,Top,2,1,4,3,,swinging_strike,668670,669711,"Weissert, Greg"
4,2025-09-28,BOS,DET,9,Top,1,1,4,3,,called_strike,668670,669711,"Weissert, Greg"


### Changing `player_name` to `pitcher_name`

In [14]:
rename_map = {
    "player_name": "pitcher_name",
    "batter": "batter_id",
    "pitcher": "pitcher_id"
}

for year in range(2021, 2026):
    df_name = f"season_{year}"
    globals()[df_name] = globals()[df_name].rename(columns=rename_map)


## Double Headers and Game ID

One issue with Statcast is that, while it provides extensive at-bat–level data, it does not include a unique game identifier, which makes distinguishing doubleheaders difficult. If doubleheaders are not handled properly, it becomes difficult to correctly identify starting pitchers for each game, which can in turn lead to incorrect rolling batter and pitcher metrics. As a result, I use the functions developed in the [Double Headers](https://github.com/Sam-Gartenstein/home-run-prediction/blob/main/Double%20Headers.ipynb) notebook from the [Home Run Prediction](https://github.com/Sam-Gartenstein/home-run-prediction/tree/main) project.



### Unique Game IDs

Now, we add a unique `game_id` to each row to support reliable joins—especially when matching to doubleheader data where the same teams can play multiple games on the same date. The `game_id` is constructed by combining the game date (formatted as `YYYYMMDD`), the matchup (`away_team@home_team`), and a cleaned version of the pitcher’s name (commas removed and spaces standardized). This produces a consistent identifier that distinguishes games on the same day and helps prevent accidental mismatches when merging datasets.


In [15]:
season_2021 = add_game_id(season_2021)
season_2022 = add_game_id(season_2022)
season_2023 = add_game_id(season_2023)
season_2024 = add_game_id(season_2024)
season_2025 = add_game_id(season_2025)

In [16]:
display(HTML("<h4>Season 2021</h4>")); display(season_2021.head(5))
display(HTML("<h4>Season 2022</h4>")); display(season_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(season_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(season_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(season_2025.head(5))

Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2021-10-03,ATL,NYM,9,Top,4,2,5,0,field_out,hit_into_play,596019,"Smith, Will",20211003_NYM@ATL_Smith_Will
1,2021-10-03,ATL,NYM,9,Top,3,2,5,0,,foul,596019,"Smith, Will",20211003_NYM@ATL_Smith_Will
2,2021-10-03,ATL,NYM,9,Top,2,2,5,0,,foul,596019,"Smith, Will",20211003_NYM@ATL_Smith_Will
3,2021-10-03,ATL,NYM,9,Top,1,2,5,0,,ball,596019,"Smith, Will",20211003_NYM@ATL_Smith_Will
4,2021-10-03,ATL,NYM,9,Top,2,1,5,0,field_out,hit_into_play,607043,"Smith, Will",20211003_NYM@ATL_Smith_Will


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2022-10-05,BAL,TOR,9,Top,1,2,5,4,field_out,hit_into_play,624415,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan
1,2022-10-05,BAL,TOR,9,Top,5,1,5,4,strikeout,swinging_strike,643376,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan
2,2022-10-05,BAL,TOR,9,Top,4,1,5,4,,ball,643376,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan
3,2022-10-05,BAL,TOR,9,Top,3,1,5,4,,swinging_strike,643376,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan
4,2022-10-05,BAL,TOR,9,Top,2,1,5,4,,called_strike,643376,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2023-10-01,BAL,BOS,9,Bot,6,2,1,6,field_out,hit_into_play,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick
1,2023-10-01,BAL,BOS,9,Bot,5,2,1,6,,foul,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick
2,2023-10-01,BAL,BOS,9,Bot,4,2,1,6,,ball,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick
3,2023-10-01,BAL,BOS,9,Bot,3,2,1,6,,ball,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick
4,2023-10-01,BAL,BOS,9,Bot,2,2,1,6,,swinging_strike,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2024-09-30,ATL,NYM,9,Bot,5,2,7,8,field_out,hit_into_play,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin
1,2024-09-30,ATL,NYM,9,Bot,4,2,7,8,,ball,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin
2,2024-09-30,ATL,NYM,9,Bot,3,2,7,8,,swinging_strike,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin
3,2024-09-30,ATL,NYM,9,Bot,2,2,7,8,,ball,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin
4,2024-09-30,ATL,NYM,9,Bot,1,2,7,8,,swinging_strike,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2025-09-28,BOS,DET,9,Top,1,2,4,3,field_out,hit_into_play,678009,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg
1,2025-09-28,BOS,DET,9,Top,4,1,4,3,strikeout,called_strike,668670,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg
2,2025-09-28,BOS,DET,9,Top,3,1,4,3,,foul,668670,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg
3,2025-09-28,BOS,DET,9,Top,2,1,4,3,,swinging_strike,668670,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg
4,2025-09-28,BOS,DET,9,Top,1,1,4,3,,called_strike,668670,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg


### Reading in Double Headers

Next, I read in the doubleheader reference tables created in the aforementioned project. These files identify games that were part of a doubleheader for each season, which is important because multiple games can share the same date and matchup, and we need a reliable way to distinguish them when merging datasets.

The code below loads `double_headers_{year}.csv` from `data/double_headers` for seasons 2022–2025, stores each season’s table in a dictionary, and assigns convenience variables (`double_headers_22`, `double_headers_23`, etc.). 


**TODO**: Go to Double Header notebook and add for 2021 so the ID's are proper and there are no issues with filling in missing data.

In [17]:
indir = Path("data") / "double_headers"

double_headers = {}

for year in range(2022, 2026):
    fp = indir / f"double_headers_{year}.csv"
    if fp.exists():
        double_headers[year] = pd.read_csv(fp)
        print(f"[loaded] {fp}")
    else:
        print(f"[missing] {fp}")

# Recreate the same variable names if you want them back
double_headers_22 = double_headers.get(2022)
double_headers_23 = double_headers.get(2023)
double_headers_24 = double_headers.get(2024)
double_headers_25 = double_headers.get(2025)

[loaded] data/double_headers/double_headers_2022.csv
[loaded] data/double_headers/double_headers_2023.csv
[loaded] data/double_headers/double_headers_2024.csv
[loaded] data/double_headers/double_headers_2025.csv


In [19]:
display(HTML("<h4>Season 2022</h4>")); display(double_headers_22.head(10))
display(HTML("<h4>Season 2023</h4>")); display(double_headers_23.head(10))
display(HTML("<h4>Season 2024</h4>")); display(double_headers_24.head(10))
display(HTML("<h4>Season 2025</h4>")); display(double_headers_25.head(10))

Unnamed: 0,date,teams,away_team,home_team,game_number,bref_url,side,pitcher,game_id
0,2022-04-19,Arizona at Washington,AZ,WSH,1,https://www.baseball-reference.com/boxes/WAS/W...,away,"Bumgarner, Madison",20220419_AZ@WSH_Bumgarner_Madison
1,2022-04-19,Arizona at Washington,AZ,WSH,1,https://www.baseball-reference.com/boxes/WAS/W...,away,"Wendelken, J.B.",20220419_AZ@WSH_Wendelken_J.B.
2,2022-04-19,Arizona at Washington,AZ,WSH,1,https://www.baseball-reference.com/boxes/WAS/W...,away,"Pérez, Óliver",20220419_AZ@WSH_Pérez_Óliver
3,2022-04-19,Arizona at Washington,AZ,WSH,1,https://www.baseball-reference.com/boxes/WAS/W...,away,"Peacock, Matt",20220419_AZ@WSH_Peacock_Matt
4,2022-04-19,San Francisco at NY Mets,SF,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Cobb, Alex",20220419_SF@NYM_Cobb_Alex
5,2022-04-19,San Francisco at NY Mets,SF,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Leone, Dominic",20220419_SF@NYM_Leone_Dominic
6,2022-04-19,San Francisco at NY Mets,SF,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Álvarez, José",20220419_SF@NYM_Álvarez_José
7,2022-04-19,San Francisco at NY Mets,SF,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"McGee, Jake",20220419_SF@NYM_McGee_Jake
8,2022-04-19,San Francisco at NY Mets,SF,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Rogers, Tyler",20220419_SF@NYM_Rogers_Tyler
9,2022-04-19,San Francisco at NY Mets,SF,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Doval, Camilo",20220419_SF@NYM_Doval_Camilo


Unnamed: 0,date,teams,away_team,home_team,game_number,bref_url,side,pitcher,game_id
0,2023-04-18,Cleveland at Detroit,CLE,DET,1,https://www.baseball-reference.com/boxes/DET/D...,away,"Gaddis, Hunter",20230418_CLE@DET_Gaddis_Hunter
1,2023-04-18,Cleveland at Detroit,CLE,DET,1,https://www.baseball-reference.com/boxes/DET/D...,away,"Morgan, Eli",20230418_CLE@DET_Morgan_Eli
2,2023-04-18,Cleveland at Detroit,CLE,DET,1,https://www.baseball-reference.com/boxes/DET/D...,away,"Sandlin, Nick",20230418_CLE@DET_Sandlin_Nick
3,2023-04-18,Cleveland at Detroit,CLE,DET,1,https://www.baseball-reference.com/boxes/DET/D...,away,"Karinchak, James",20230418_CLE@DET_Karinchak_James
4,2023-04-18,Philadelphia at Chicago Sox,PHI,CWS,1,https://www.baseball-reference.com/boxes/CHA/C...,away,"Wheeler, Zack",20230418_PHI@CWS_Wheeler_Zack
5,2023-04-18,Philadelphia at Chicago Sox,PHI,CWS,1,https://www.baseball-reference.com/boxes/CHA/C...,away,"Soto, Gregory",20230418_PHI@CWS_Soto_Gregory
6,2023-04-18,Philadelphia at Chicago Sox,PHI,CWS,1,https://www.baseball-reference.com/boxes/CHA/C...,away,"Kimbrel, Craig",20230418_PHI@CWS_Kimbrel_Craig
7,2023-04-18,Philadelphia at Chicago Sox,PHI,CWS,1,https://www.baseball-reference.com/boxes/CHA/C...,away,"Domínguez, Seranthony",20230418_PHI@CWS_Domínguez_Seranthony
8,2023-04-18,Philadelphia at Chicago Sox,PHI,CWS,1,https://www.baseball-reference.com/boxes/CHA/C...,away,"Alvarado, José",20230418_PHI@CWS_Alvarado_José
9,2023-04-18,Cleveland at Detroit,CLE,DET,1,https://www.baseball-reference.com/boxes/DET/D...,home,"Boyd, Matthew",20230418_CLE@DET_Boyd_Matthew


Unnamed: 0,date,teams,away_team,home_team,game_number,bref_url,side,pitcher,game_id
0,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Mize, Casey",20240404_DET@NYM_Mize_Casey
1,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Wentz, Joey",20240404_DET@NYM_Wentz_Joey
2,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Lange, Alex",20240404_DET@NYM_Lange_Alex
3,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Chafin, Andrew",20240404_DET@NYM_Chafin_Andrew
4,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Foley, Jason",20240404_DET@NYM_Foley_Jason
5,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,away,"Miller, Shelby",20240404_DET@NYM_Miller_Shelby
6,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,home,"Houser, Adrian",20240404_DET@NYM_Houser_Adrian
7,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,home,"Raley, Brooks",20240404_DET@NYM_Raley_Brooks
8,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,home,"Smith, Drew",20240404_DET@NYM_Smith_Drew
9,2024-04-04,Detroit at NY Mets,DET,NYM,1,https://www.baseball-reference.com/boxes/NYN/N...,home,"Diekman, Jake",20240404_DET@NYM_Diekman_Jake


Unnamed: 0,date,teams,away_team,home_team,game_number,bref_url,side,pitcher,game_id
0,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,away,"Pallante, Andre",20250406_STL@BOS_Pallante_Andre
1,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,away,"Leahy, Kyle",20250406_STL@BOS_Leahy_Kyle
2,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,away,"Romero, JoJo",20250406_STL@BOS_Romero_JoJo
3,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,away,"Maton, Phil",20250406_STL@BOS_Maton_Phil
4,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,away,"Helsley, Ryan",20250406_STL@BOS_Helsley_Ryan
5,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,away,"Fernandez, Ryan",20250406_STL@BOS_Fernandez_Ryan
6,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,home,"Newcomb, Sean",20250406_STL@BOS_Newcomb_Sean
7,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,home,"Weissert, Greg",20250406_STL@BOS_Weissert_Greg
8,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,home,"Wilson, Justin",20250406_STL@BOS_Wilson_Justin
9,2025-04-06,St. Louis at Boston,STL,BOS,1,https://www.baseball-reference.com/boxes/BOS/B...,home,"Slaten, Justin",20250406_STL@BOS_Slaten_Justin


### Unique Headers

The code below gets the unique headers.

In [20]:
unique_ids_22 = double_headers_22["game_id"].unique()
unique_ids_23 = double_headers_23["game_id"].unique()
unique_ids_24 = double_headers_24["game_id"].unique()
unique_ids_25 = double_headers_25["game_id"].unique()

### Merging Doubleheader Game Numbers Into Season Data

This function merges doubleheader metadata into the at-bat dataset by performing a **left join on `game_id`**. It first selects only the relevant fields from the doubleheader table—`game_id`, `game_number`, and `pitcher`—and removes duplicates to ensure a clean one-to-one mapping per game.

The merge then appends `game_number` (and the pitcher label from the doubleheader file) to `at_bats_df` while preserving all rows in the at-bat data. Games that are not part of a doubleheader will simply have missing (`NaN`) values for these added fields.


In [21]:
season_2022 = merge_game_number_and_pitcher(season_2022, double_headers_22)
season_2023 = merge_game_number_and_pitcher(season_2023, double_headers_23)
season_2024 = merge_game_number_and_pitcher(season_2024, double_headers_24)
season_2025 = merge_game_number_and_pitcher(season_2025, double_headers_25)

In [22]:
display(HTML("<h4>Season 2022</h4>")); display(season_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(season_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(season_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(season_2025.head(5))

Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,game_number,pitcher
0,2022-10-05,BAL,TOR,9,Top,1,2,5,4,field_out,hit_into_play,624415,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan,1.0,"Baker, Bryan"
1,2022-10-05,BAL,TOR,9,Top,5,1,5,4,strikeout,swinging_strike,643376,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan,1.0,"Baker, Bryan"
2,2022-10-05,BAL,TOR,9,Top,4,1,5,4,,ball,643376,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan,1.0,"Baker, Bryan"
3,2022-10-05,BAL,TOR,9,Top,3,1,5,4,,swinging_strike,643376,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan,1.0,"Baker, Bryan"
4,2022-10-05,BAL,TOR,9,Top,2,1,5,4,,called_strike,643376,"Baker, Bryan",20221005_TOR@BAL_Baker_Bryan,1.0,"Baker, Bryan"


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,game_number,pitcher
0,2023-10-01,BAL,BOS,9,Bot,6,2,1,6,field_out,hit_into_play,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick,,
1,2023-10-01,BAL,BOS,9,Bot,5,2,1,6,,foul,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick,,
2,2023-10-01,BAL,BOS,9,Bot,4,2,1,6,,ball,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick,,
3,2023-10-01,BAL,BOS,9,Bot,3,2,1,6,,ball,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick,,
4,2023-10-01,BAL,BOS,9,Bot,2,2,1,6,,swinging_strike,677008,"Robertson, Nick",20231001_BOS@BAL_Robertson_Nick,,


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,game_number,pitcher
0,2024-09-30,ATL,NYM,9,Bot,5,2,7,8,field_out,hit_into_play,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin,,
1,2024-09-30,ATL,NYM,9,Bot,4,2,7,8,,ball,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin,,
2,2024-09-30,ATL,NYM,9,Bot,3,2,7,8,,swinging_strike,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin,,
3,2024-09-30,ATL,NYM,9,Bot,2,2,7,8,,ball,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin,,
4,2024-09-30,ATL,NYM,9,Bot,1,2,7,8,,swinging_strike,518595,"Díaz, Edwin",20240930_NYM@ATL_Díaz_Edwin,,


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,game_number,pitcher
0,2025-09-28,BOS,DET,9,Top,1,2,4,3,field_out,hit_into_play,678009,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg,,
1,2025-09-28,BOS,DET,9,Top,4,1,4,3,strikeout,called_strike,668670,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg,,
2,2025-09-28,BOS,DET,9,Top,3,1,4,3,,foul,668670,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg,,
3,2025-09-28,BOS,DET,9,Top,2,1,4,3,,swinging_strike,668670,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg,,
4,2025-09-28,BOS,DET,9,Top,1,1,4,3,,called_strike,668670,"Weissert, Greg",20250928_DET@BOS_Weissert_Greg,,


### Validating Doubleheader Pitcher Data

Below, I verify that pitcher identifiers are not missing for games flagged as doubleheaders by counting `NaN` values in the `pitcher` column where `game_number` is present. A count of zero indicates that all doubleheader games have valid pitcher identifiers, ensuring starting pitchers can be correctly identified for subsequent analysis.


In [23]:
for year in range(2022, 2026):
    df = globals()[f"season_{year}"]

    na_count = (
        df.loc[df["game_number"].notna(), "pitcher"]
        .isna()
        .sum()
    )

    print(f"{year}: pitcher NA count = {na_count}")

2022: pitcher NA count = 0
2023: pitcher NA count = 0
2024: pitcher NA count = 0
2025: pitcher NA count = 0


### Trimming `game_id` to a Game-Level Identifier

Now that the doubleheader merge is validated (i.e., no missing pitchers), we no longer need the pitcher component embedded in `game_id`. This function therefore trims `game_id` from a pitcher-specific identifier (e.g., `YYYYMMDD_AWAY@HOME_Pitcher_Name`) to a game-level identifier (`YYYYMMDD_AWAY@HOME`). It does this by splitting on underscores, keeping only the date and matchup portions, and overwriting the original `game_id` column with the trimmed version.


In [24]:
season_2022 = trim_game_id_inplace(season_2022)
season_2023 = trim_game_id_inplace(season_2023)
season_2024 = trim_game_id_inplace(season_2024)
season_2025 = trim_game_id_inplace(season_2025)

display(HTML("<h4>Season 2022</h4>")); display(season_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(season_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(season_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(season_2025.head(5))

Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,game_number,pitcher
0,2022-10-05,BAL,TOR,9,Top,1,2,5,4,field_out,hit_into_play,624415,"Baker, Bryan",20221005_TOR@BAL,1.0,"Baker, Bryan"
1,2022-10-05,BAL,TOR,9,Top,5,1,5,4,strikeout,swinging_strike,643376,"Baker, Bryan",20221005_TOR@BAL,1.0,"Baker, Bryan"
2,2022-10-05,BAL,TOR,9,Top,4,1,5,4,,ball,643376,"Baker, Bryan",20221005_TOR@BAL,1.0,"Baker, Bryan"
3,2022-10-05,BAL,TOR,9,Top,3,1,5,4,,swinging_strike,643376,"Baker, Bryan",20221005_TOR@BAL,1.0,"Baker, Bryan"
4,2022-10-05,BAL,TOR,9,Top,2,1,5,4,,called_strike,643376,"Baker, Bryan",20221005_TOR@BAL,1.0,"Baker, Bryan"


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,game_number,pitcher
0,2023-10-01,BAL,BOS,9,Bot,6,2,1,6,field_out,hit_into_play,677008,"Robertson, Nick",20231001_BOS@BAL,,
1,2023-10-01,BAL,BOS,9,Bot,5,2,1,6,,foul,677008,"Robertson, Nick",20231001_BOS@BAL,,
2,2023-10-01,BAL,BOS,9,Bot,4,2,1,6,,ball,677008,"Robertson, Nick",20231001_BOS@BAL,,
3,2023-10-01,BAL,BOS,9,Bot,3,2,1,6,,ball,677008,"Robertson, Nick",20231001_BOS@BAL,,
4,2023-10-01,BAL,BOS,9,Bot,2,2,1,6,,swinging_strike,677008,"Robertson, Nick",20231001_BOS@BAL,,


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,game_number,pitcher
0,2024-09-30,ATL,NYM,9,Bot,5,2,7,8,field_out,hit_into_play,518595,"Díaz, Edwin",20240930_NYM@ATL,,
1,2024-09-30,ATL,NYM,9,Bot,4,2,7,8,,ball,518595,"Díaz, Edwin",20240930_NYM@ATL,,
2,2024-09-30,ATL,NYM,9,Bot,3,2,7,8,,swinging_strike,518595,"Díaz, Edwin",20240930_NYM@ATL,,
3,2024-09-30,ATL,NYM,9,Bot,2,2,7,8,,ball,518595,"Díaz, Edwin",20240930_NYM@ATL,,
4,2024-09-30,ATL,NYM,9,Bot,1,2,7,8,,swinging_strike,518595,"Díaz, Edwin",20240930_NYM@ATL,,


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,game_number,pitcher
0,2025-09-28,BOS,DET,9,Top,1,2,4,3,field_out,hit_into_play,678009,"Weissert, Greg",20250928_DET@BOS,,
1,2025-09-28,BOS,DET,9,Top,4,1,4,3,strikeout,called_strike,668670,"Weissert, Greg",20250928_DET@BOS,,
2,2025-09-28,BOS,DET,9,Top,3,1,4,3,,foul,668670,"Weissert, Greg",20250928_DET@BOS,,
3,2025-09-28,BOS,DET,9,Top,2,1,4,3,,swinging_strike,668670,"Weissert, Greg",20250928_DET@BOS,,
4,2025-09-28,BOS,DET,9,Top,1,1,4,3,,called_strike,668670,"Weissert, Greg",20250928_DET@BOS,,


### Appending Game Number to the ID

Now that we have a clean game-level `game_id` and have successfully merged in `game_number`, we append the game number to the identifier to clearly distinguish **Game 1 vs. Game 2** of a doubleheader. This prevents two games with the same date and matchup from sharing the same ID and ensures that subsequent merges and exploratory analysis treat them as separate events.

The function below updates `game_id` only when `game_number` is present (non-missing), converting identifiers like `YYYYMMDD_AWAY@HOME` into `YYYYMMDD_AWAY@HOME_1` (or `_2`). For non-doubleheader games where `game_number` is missing, `game_id` is left unchanged.


In [25]:
season_2022 = append_game_number_to_game_id(season_2022)
season_2023 = append_game_number_to_game_id(season_2023)
season_2024 = append_game_number_to_game_id(season_2024)
season_2025 = append_game_number_to_game_id(season_2025)

display(HTML("<h4>Season 2022</h4>")); display(season_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(season_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(season_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(season_2025.head(5))

Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2022-10-05,BAL,TOR,9,Top,1,2,5,4,field_out,hit_into_play,624415,"Baker, Bryan",20221005_TOR@BAL_1
1,2022-10-05,BAL,TOR,9,Top,5,1,5,4,strikeout,swinging_strike,643376,"Baker, Bryan",20221005_TOR@BAL_1
2,2022-10-05,BAL,TOR,9,Top,4,1,5,4,,ball,643376,"Baker, Bryan",20221005_TOR@BAL_1
3,2022-10-05,BAL,TOR,9,Top,3,1,5,4,,swinging_strike,643376,"Baker, Bryan",20221005_TOR@BAL_1
4,2022-10-05,BAL,TOR,9,Top,2,1,5,4,,called_strike,643376,"Baker, Bryan",20221005_TOR@BAL_1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2023-10-01,BAL,BOS,9,Bot,6,2,1,6,field_out,hit_into_play,677008,"Robertson, Nick",20231001_BOS@BAL
1,2023-10-01,BAL,BOS,9,Bot,5,2,1,6,,foul,677008,"Robertson, Nick",20231001_BOS@BAL
2,2023-10-01,BAL,BOS,9,Bot,4,2,1,6,,ball,677008,"Robertson, Nick",20231001_BOS@BAL
3,2023-10-01,BAL,BOS,9,Bot,3,2,1,6,,ball,677008,"Robertson, Nick",20231001_BOS@BAL
4,2023-10-01,BAL,BOS,9,Bot,2,2,1,6,,swinging_strike,677008,"Robertson, Nick",20231001_BOS@BAL


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2024-09-30,ATL,NYM,9,Bot,5,2,7,8,field_out,hit_into_play,518595,"Díaz, Edwin",20240930_NYM@ATL
1,2024-09-30,ATL,NYM,9,Bot,4,2,7,8,,ball,518595,"Díaz, Edwin",20240930_NYM@ATL
2,2024-09-30,ATL,NYM,9,Bot,3,2,7,8,,swinging_strike,518595,"Díaz, Edwin",20240930_NYM@ATL
3,2024-09-30,ATL,NYM,9,Bot,2,2,7,8,,ball,518595,"Díaz, Edwin",20240930_NYM@ATL
4,2024-09-30,ATL,NYM,9,Bot,1,2,7,8,,swinging_strike,518595,"Díaz, Edwin",20240930_NYM@ATL


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id
0,2025-09-28,BOS,DET,9,Top,1,2,4,3,field_out,hit_into_play,678009,"Weissert, Greg",20250928_DET@BOS
1,2025-09-28,BOS,DET,9,Top,4,1,4,3,strikeout,called_strike,668670,"Weissert, Greg",20250928_DET@BOS
2,2025-09-28,BOS,DET,9,Top,3,1,4,3,,foul,668670,"Weissert, Greg",20250928_DET@BOS
3,2025-09-28,BOS,DET,9,Top,2,1,4,3,,swinging_strike,668670,"Weissert, Greg",20250928_DET@BOS
4,2025-09-28,BOS,DET,9,Top,1,1,4,3,,called_strike,668670,"Weissert, Greg",20250928_DET@BOS


## Pitcher Metrics

Now, we can begin creating the features needed for modeling. We start by constructing pitching features based on the **starting pitcher**, including:

- **FIP**
- **WHIP**
- **K9**
- **HR9**

For each metric, we compute rolling **3-day** and **7-day** values and then calculate the difference between the home and away teams. In addition, we compute rolling **FIP** for the remaining bullpen.

Formal definitions of each pitching metric and implementation details are provided in a later section.


### Starting Pitcher Indicator

First, we create an indicator to identify the starting pitcher for each game in every season. This allows us to separate starter performance from bullpen performance when constructing pitching features.


In [26]:
for y in range(2021, 2026):
    name = f"season_{y}"
    df = globals().get(name)
    if df is None:
        print(f"{name}: (not found)")
        continue

    globals()[name] = add_starter_indicator_pitchlevel(df)
    print(f"{name}: starter enforced")


season_2021: starter enforced
season_2022: starter enforced
season_2023: starter enforced
season_2024: starter enforced
season_2025: starter enforced


In [27]:
display(HTML("<h4>Season 2022</h4>")); display(season_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(season_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(season_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(season_2025.head(5))

Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,pitching_team,is_starter
0,2022-04-07,ATL,CIN,1,Top,1,0,0,0,,called_strike,663697,"Fried, Max",20220407_CIN@ATL,ATL,1
1,2022-04-07,ATL,CIN,1,Top,2,0,0,0,,called_strike,663697,"Fried, Max",20220407_CIN@ATL,ATL,1
2,2022-04-07,ATL,CIN,1,Top,3,0,0,0,strikeout,swinging_strike,663697,"Fried, Max",20220407_CIN@ATL,ATL,1
3,2022-04-07,ATL,CIN,1,Top,1,1,0,0,,called_strike,606157,"Fried, Max",20220407_CIN@ATL,ATL,1
4,2022-04-07,ATL,CIN,1,Top,2,1,0,0,,ball,606157,"Fried, Max",20220407_CIN@ATL,ATL,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,pitching_team,is_starter
0,2023-03-30,WSH,ATL,1,Bot,1,0,0,0,,ball,608841,"Fried, Max",20230330_ATL@WSH,ATL,1
1,2023-03-30,WSH,ATL,1,Bot,1,0,0,0,single,hit_into_play,657041,"Fried, Max",20230330_ATL@WSH,ATL,1
2,2023-03-30,WSH,ATL,1,Bot,2,0,0,0,,foul,608841,"Fried, Max",20230330_ATL@WSH,ATL,1
3,2023-03-30,WSH,ATL,1,Bot,3,0,0,0,grounded_into_double_play,hit_into_play,608841,"Fried, Max",20230330_ATL@WSH,ATL,1
4,2023-03-30,WSH,ATL,1,Bot,1,2,0,0,,swinging_strike,600869,"Fried, Max",20230330_ATL@WSH,ATL,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,pitching_team,is_starter
0,2024-03-20,SD,LAD,1,Bot,1,0,0,0,,ball,593428,"Glasnow, Tyler",20240320_LAD@SD,LAD,1
1,2024-03-20,SD,LAD,1,Bot,2,0,0,0,,called_strike,593428,"Glasnow, Tyler",20240320_LAD@SD,LAD,1
2,2024-03-20,SD,LAD,1,Bot,3,0,0,0,,ball,593428,"Glasnow, Tyler",20240320_LAD@SD,LAD,1
3,2024-03-20,SD,LAD,1,Bot,4,0,0,0,,called_strike,593428,"Glasnow, Tyler",20240320_LAD@SD,LAD,1
4,2024-03-20,SD,LAD,1,Bot,5,0,0,0,,ball,593428,"Glasnow, Tyler",20240320_LAD@SD,LAD,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,description,batter_id,pitcher_name,game_id,pitching_team,is_starter
0,2025-03-18,CHC,LAD,1,Top,1,0,0,0,,called_strike,660271,"Imanaga, Shota",20250318_LAD@CHC,CHC,1
1,2025-03-18,CHC,LAD,1,Top,2,0,0,0,,ball,660271,"Imanaga, Shota",20250318_LAD@CHC,CHC,1
2,2025-03-18,CHC,LAD,1,Top,3,0,0,0,field_out,hit_into_play,660271,"Imanaga, Shota",20250318_LAD@CHC,CHC,1
3,2025-03-18,CHC,LAD,1,Top,1,1,0,0,,ball,669242,"Imanaga, Shota",20250318_LAD@CHC,CHC,1
4,2025-03-18,CHC,LAD,1,Top,2,1,0,0,,swinging_strike,669242,"Imanaga, Shota",20250318_LAD@CHC,CHC,1


### Unique Events and Descriptions

**MOVE TO THIS ABOVE**

In [28]:
for y in range(2021, 2026):
    df = globals().get(f"season_{y}")
    if df is None:
        print(f"season_{y}: (not loaded)")
        continue

    events_unique = sorted(df["events"].dropna().astype(str).unique())
    desc_unique   = sorted(df["description"].dropna().astype(str).unique())

    print(f"\n=== season_{y} ===")
    print(f"events unique ({len(events_unique)}):")
    print(events_unique)
    print(f"\ndescription unique ({len(desc_unique)}):")
    print(desc_unique)


=== season_2021 ===
events unique (23):
['catcher_interf', 'double', 'double_play', 'field_error', 'field_out', 'fielders_choice', 'fielders_choice_out', 'force_out', 'grounded_into_double_play', 'hit_by_pitch', 'home_run', 'intent_walk', 'sac_bunt', 'sac_bunt_double_play', 'sac_fly', 'sac_fly_double_play', 'single', 'strikeout', 'strikeout_double_play', 'triple', 'triple_play', 'truncated_pa', 'walk']

description unique (15):
['automatic_ball', 'ball', 'blocked_ball', 'bunt_foul_tip', 'called_strike', 'foul', 'foul_bunt', 'foul_pitchout', 'foul_tip', 'hit_by_pitch', 'hit_into_play', 'missed_bunt', 'pitchout', 'swinging_strike', 'swinging_strike_blocked']

=== season_2022 ===
events unique (22):
['catcher_interf', 'double', 'double_play', 'field_error', 'field_out', 'fielders_choice', 'fielders_choice_out', 'force_out', 'grounded_into_double_play', 'hit_by_pitch', 'home_run', 'intent_walk', 'sac_bunt', 'sac_fly', 'sac_fly_double_play', 'single', 'strikeout', 'strikeout_double_play', 

### Plate Appearances

Below, I filter the Statcast pitch-level data to retain only **plate appearance–ending** events (e.g., hits, walks, strikeouts, outs, and sacrifices). This ensures each plate appearance is counted once and excludes incomplete plate appearances labeled as `truncated_pa`. These filtered plate appearances are then used to calculate features for both pitchers and batters later in the notebook.


In [29]:
for y in range(2021, 2026):
    season_name = f"season_{y}"
    pa_name = f"pa_{y}"

    df = globals().get(season_name)
    if df is None:
        print(f"{season_name}: (not loaded)")
        continue

    globals()[pa_name] = filter_plate_appearances(df)
    print(f"{pa_name}: {len(globals()[pa_name]):,} rows kept")


pa_2021: 181,816 rows kept
pa_2022: 182,147 rows kept
pa_2023: 184,163 rows kept
pa_2024: 182,516 rows kept
pa_2025: 182,949 rows kept


### Pitching Indicators

Next, we create indicator variables needed to compute rolling **FIP**, **WHIP**, **K9**, and **HR9**. These pitching metrics are defined as follows:

$$\text{WHIP} = \frac{H + BB + HBP}{IP}$$

$$\text{K/9} = \frac{9 \times K}{IP}$$

$$\text{HR/9} = \frac{9 \times HR}{IP}$$

$$\text{FIP} = \frac{13 \times HR + 3 \times (BB + HBP) - 2 \times K}{IP}$$


where **IP** denotes innings pitched, computed as total outs divided by three. These formulas are applied to rolling aggregates of the underlying event indicators to construct the final pitching features.


**NOTE TO SELF**: Explain what FIP, WHIP, HR9, and K9 are earlier in the notebook, pferable at begining of pitching section

In [30]:
for y in range(2021, 2026):
    src_name = f"pa_{y}"
    dst_name = f"pa_pitcher_{y}"

    df = globals().get(src_name)
    if df is None:
        print(f"{src_name}: (not found)")
        continue

    globals()[dst_name] = add_pitching_indicators(df.copy())
    print(f"{dst_name}: indicators added (from {src_name})")


pa_pitcher_2021: indicators added (from pa_2021)
pa_pitcher_2022: indicators added (from pa_2022)
pa_pitcher_2023: indicators added (from pa_2023)
pa_pitcher_2024: indicators added (from pa_2024)
pa_pitcher_2025: indicators added (from pa_2025)


In [31]:
display(HTML("<h4>Season 2021</h4>")); display(pa_pitcher_2021.head(5))
display(HTML("<h4>Season 2022</h4>")); display(pa_pitcher_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(pa_pitcher_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(pa_pitcher_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(pa_pitcher_2025.head(5))

Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
7,2021-04-01,PHI,ATL,8,Top,4,0,2,2,walk,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,1,0,0,0,0
8,2021-04-01,PHI,ATL,8,Top,5,0,2,2,strikeout,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,0,0,1,0,1
10,2021-04-01,PHI,ATL,8,Top,1,1,2,2,single,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,0,0,0,1,0
12,2021-04-01,PHI,ATL,8,Top,3,1,2,2,strikeout,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,0,0,1,0,1
18,2021-04-01,PHI,ATL,8,Top,3,2,2,2,hit_by_pitch,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,0,1,0,0,0


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
2,2022-04-07,ATL,CIN,1,Top,3,0,0,0,strikeout,...,20220407_CIN@ATL,ATL,1,True,0,0,0,1,0,1
7,2022-04-07,ATL,CIN,1,Top,5,1,0,0,strikeout,...,20220407_CIN@ATL,ATL,1,True,0,0,0,1,0,1
10,2022-04-07,ATL,CIN,1,Top,3,2,0,0,field_out,...,20220407_CIN@ATL,ATL,1,True,0,0,0,0,0,1
15,2022-04-07,ATL,CIN,2,Top,5,0,0,0,strikeout,...,20220407_CIN@ATL,ATL,1,True,0,0,0,1,0,1
21,2022-04-07,ATL,CIN,2,Top,2,1,0,0,hit_by_pitch,...,20220407_CIN@ATL,ATL,1,True,0,0,1,0,0,0


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
1,2023-03-30,WSH,ATL,1,Bot,1,0,0,0,single,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,1,0
3,2023-03-30,WSH,ATL,1,Bot,3,0,0,0,grounded_into_double_play,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,0,2
7,2023-03-30,WSH,ATL,1,Bot,4,2,0,0,field_out,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,0,1
9,2023-03-30,WSH,ATL,2,Bot,1,0,0,3,double,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,1,0
13,2023-03-30,WSH,ATL,2,Bot,3,0,0,3,sac_fly,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,0,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
5,2024-03-20,SD,LAD,1,Bot,6,0,0,0,field_out,...,20240320_LAD@SD,LAD,1,True,0,0,0,0,0,1
6,2024-03-20,SD,LAD,1,Bot,1,1,0,0,field_out,...,20240320_LAD@SD,LAD,1,True,0,0,0,0,0,1
12,2024-03-20,SD,LAD,1,Bot,6,2,0,0,strikeout,...,20240320_LAD@SD,LAD,1,True,0,0,0,1,0,1
13,2024-03-20,SD,LAD,2,Bot,1,0,0,0,field_out,...,20240320_LAD@SD,LAD,1,True,0,0,0,0,0,1
18,2024-03-20,SD,LAD,2,Bot,5,1,0,0,field_out,...,20240320_LAD@SD,LAD,1,True,0,0,0,0,0,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
2,2025-03-18,CHC,LAD,1,Top,3,0,0,0,field_out,...,20250318_LAD@CHC,CHC,1,True,0,0,0,0,0,1
5,2025-03-18,CHC,LAD,1,Top,3,1,0,0,field_out,...,20250318_LAD@CHC,CHC,1,True,0,0,0,0,0,1
8,2025-03-18,CHC,LAD,1,Top,3,2,0,0,strikeout,...,20250318_LAD@CHC,CHC,1,True,0,0,0,1,0,1
21,2025-03-18,CHC,LAD,2,Top,5,0,0,0,field_out,...,20250318_LAD@CHC,CHC,1,True,0,0,0,0,0,1
27,2025-03-18,CHC,LAD,2,Top,7,0,0,0,walk,...,20250318_LAD@CHC,CHC,1,True,0,1,0,0,0,0


### Splitting Starters and Bullpen

To construct pitching features, we separate **starting pitcher** plate appearances from those thrown by the **bullpen**. 

By splitting the data into starter and bullpen subsets, we can compute rolling metrics for the starting pitcher at the individual level, while separately aggregating bullpen performance at the team level. This ensures that each set of pitching features accurately reflects the intended pitcher role and avoids mixing starter and relief appearances.

-----

**Move this description elsewhere**: This distinction is necessary because starting pitchers and relievers play fundamentally different roles, and their contributions are used differently when computing rolling pitching metrics.



In [32]:
for y in range(2021, 2026):
    df = globals().get(f"pa_pitcher_{y}")
    if df is None:
        print(f"pa_pitcher_{y}: (not found)")
        continue

    starter_df, bullpen_df = split_starter_bullpen(df, validate=True)

    globals()[f"pa_starter_{y}"] = starter_df
    globals()[f"pa_bullpen_{y}"] = bullpen_df

    print(f"{y}: starter={len(starter_df):,} | bullpen={len(bullpen_df):,}")


2021: starter=181,816 | bullpen=0
2022: starter=106,836 | bullpen=75,311
2023: starter=106,479 | bullpen=77,684
2024: starter=107,055 | bullpen=75,461
2025: starter=106,611 | bullpen=76,338


In [33]:
display(HTML("<h4>Starting Pitcher Season 2021</h4>")); display(pa_starter_2021.head(5))
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(pa_starter_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(pa_starter_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(pa_starter_2024.head(5))
display(HTML("<h4>Starting PitcherSeason 2025</h4>")); display(pa_starter_2025.head(5))
print(" ")
display(HTML("<h4>Bullpen Pitcher Season 2021</h4>")); display(pa_bullpen_2021.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2022</h4>")); display(pa_bullpen_2022.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2023</h4>")); display(pa_bullpen_2023.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2024</h4>")); display(pa_bullpen_2024.head(5))
display(HTML("<h4>Bullpen PitcherSeason 2025</h4>")); display(pa_bullpen_2025.head(5))


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
7,2021-04-01,PHI,ATL,8,Top,4,0,2,2,walk,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,1,0,0,0,0
8,2021-04-01,PHI,ATL,8,Top,5,0,2,2,strikeout,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,0,0,1,0,1
10,2021-04-01,PHI,ATL,8,Top,1,1,2,2,single,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,0,0,0,1,0
12,2021-04-01,PHI,ATL,8,Top,3,1,2,2,strikeout,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,0,0,1,0,1
18,2021-04-01,PHI,ATL,8,Top,3,2,2,2,hit_by_pitch,...,20210401_ATL@PHI_Alvarado_José,PHI,1,True,0,0,1,0,0,0


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
2,2022-04-07,ATL,CIN,1,Top,3,0,0,0,strikeout,...,20220407_CIN@ATL,ATL,1,True,0,0,0,1,0,1
7,2022-04-07,ATL,CIN,1,Top,5,1,0,0,strikeout,...,20220407_CIN@ATL,ATL,1,True,0,0,0,1,0,1
10,2022-04-07,ATL,CIN,1,Top,3,2,0,0,field_out,...,20220407_CIN@ATL,ATL,1,True,0,0,0,0,0,1
15,2022-04-07,ATL,CIN,2,Top,5,0,0,0,strikeout,...,20220407_CIN@ATL,ATL,1,True,0,0,0,1,0,1
21,2022-04-07,ATL,CIN,2,Top,2,1,0,0,hit_by_pitch,...,20220407_CIN@ATL,ATL,1,True,0,0,1,0,0,0


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
1,2023-03-30,WSH,ATL,1,Bot,1,0,0,0,single,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,1,0
3,2023-03-30,WSH,ATL,1,Bot,3,0,0,0,grounded_into_double_play,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,0,2
7,2023-03-30,WSH,ATL,1,Bot,4,2,0,0,field_out,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,0,1
9,2023-03-30,WSH,ATL,2,Bot,1,0,0,3,double,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,1,0
13,2023-03-30,WSH,ATL,2,Bot,3,0,0,3,sac_fly,...,20230330_ATL@WSH,ATL,1,True,0,0,0,0,0,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
5,2024-03-20,SD,LAD,1,Bot,6,0,0,0,field_out,...,20240320_LAD@SD,LAD,1,True,0,0,0,0,0,1
6,2024-03-20,SD,LAD,1,Bot,1,1,0,0,field_out,...,20240320_LAD@SD,LAD,1,True,0,0,0,0,0,1
12,2024-03-20,SD,LAD,1,Bot,6,2,0,0,strikeout,...,20240320_LAD@SD,LAD,1,True,0,0,0,1,0,1
13,2024-03-20,SD,LAD,2,Bot,1,0,0,0,field_out,...,20240320_LAD@SD,LAD,1,True,0,0,0,0,0,1
18,2024-03-20,SD,LAD,2,Bot,5,1,0,0,field_out,...,20240320_LAD@SD,LAD,1,True,0,0,0,0,0,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
2,2025-03-18,CHC,LAD,1,Top,3,0,0,0,field_out,...,20250318_LAD@CHC,CHC,1,True,0,0,0,0,0,1
5,2025-03-18,CHC,LAD,1,Top,3,1,0,0,field_out,...,20250318_LAD@CHC,CHC,1,True,0,0,0,0,0,1
8,2025-03-18,CHC,LAD,1,Top,3,2,0,0,strikeout,...,20250318_LAD@CHC,CHC,1,True,0,0,0,1,0,1
21,2025-03-18,CHC,LAD,2,Top,5,0,0,0,field_out,...,20250318_LAD@CHC,CHC,1,True,0,0,0,0,0,1
27,2025-03-18,CHC,LAD,2,Top,7,0,0,0,walk,...,20250318_LAD@CHC,CHC,1,True,0,1,0,0,0,0


 


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
92,2022-04-07,ATL,CIN,6,Top,4,2,1,6,single,...,20220407_CIN@ATL,ATL,0,True,0,0,0,0,1,0
96,2022-04-07,ATL,CIN,6,Top,5,2,1,3,home_run,...,20220407_CIN@ATL,ATL,0,True,1,0,0,0,1,0
99,2022-04-07,ATL,CIN,6,Top,7,2,1,6,strikeout,...,20220407_CIN@ATL,ATL,0,True,0,0,0,1,0,1
104,2022-04-07,ATL,CIN,7,Top,5,0,1,6,strikeout,...,20220407_CIN@ATL,ATL,0,True,0,0,0,1,0,1
107,2022-04-07,ATL,CIN,7,Top,3,1,1,6,field_out,...,20220407_CIN@ATL,ATL,0,True,0,0,0,0,0,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
48,2023-03-30,WSH,ATL,4,Bot,3,1,1,4,single,...,20230330_ATL@WSH,ATL,0,True,0,0,0,0,1,0
50,2023-03-30,WSH,ATL,4,Bot,5,1,1,4,force_out,...,20230330_ATL@WSH,ATL,0,True,0,0,0,0,0,1
51,2023-03-30,WSH,ATL,4,Bot,1,2,1,4,field_out,...,20230330_ATL@WSH,ATL,0,True,0,0,0,0,0,1
55,2023-03-30,WSH,ATL,5,Bot,2,0,1,4,single,...,20230330_ATL@WSH,ATL,0,True,0,0,0,0,1,0
59,2023-03-30,WSH,ATL,5,Bot,6,0,1,4,field_out,...,20230330_ATL@WSH,ATL,0,True,0,0,0,0,0,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
80,2024-03-20,SD,LAD,6,Bot,4,0,2,1,strikeout,...,20240320_LAD@SD,LAD,0,True,0,0,0,1,0,1
83,2024-03-20,SD,LAD,6,Bot,3,1,2,1,field_out,...,20240320_LAD@SD,LAD,0,True,0,0,0,0,0,1
87,2024-03-20,SD,LAD,6,Bot,4,2,2,1,field_out,...,20240320_LAD@SD,LAD,0,True,0,0,0,0,0,1
95,2024-03-20,SD,LAD,7,Bot,8,0,2,1,strikeout,...,20240320_LAD@SD,LAD,0,True,0,0,0,1,0,1
97,2024-03-20,SD,LAD,7,Bot,2,1,2,1,field_out,...,20240320_LAD@SD,LAD,0,True,0,0,0,0,0,1


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,game_id,pitching_team,is_starter,is_pa_countable,is_hr,is_bb,is_hbp,is_k,is_h,outs
72,2025-03-18,CHC,LAD,5,Top,4,0,1,0,strikeout,...,20250318_LAD@CHC,CHC,0,True,0,0,0,1,0,1
74,2025-03-18,CHC,LAD,5,Top,1,1,1,0,single,...,20250318_LAD@CHC,CHC,0,True,0,0,0,0,1,0
80,2025-03-18,CHC,LAD,5,Top,3,1,1,1,force_out,...,20250318_LAD@CHC,CHC,0,True,0,0,0,0,0,1
83,2025-03-18,CHC,LAD,5,Top,4,1,1,0,single,...,20250318_LAD@CHC,CHC,0,True,0,0,0,0,1,0
85,2025-03-18,CHC,LAD,5,Top,5,1,1,0,walk,...,20250318_LAD@CHC,CHC,0,True,0,1,0,0,0,0


### Aggregating Pitching Game Lines

To calculate pitching features, we first aggregate the pitch-level (plate appearance) data to a **game-level pitching line**. This step converts many plate appearance rows into a single summary row per game, containing the core inputs needed for rolling metrics (IP, H, BB, HBP, K, HR).

We use two aggregation levels depending on the feature set: for **starting pitchers**, we aggregate at the individual pitcher-by-game level, while for the **bullpen**, we aggregate at the team-by-game level. Creating these game lines makes it straightforward to compute rolling 3-day and 7-day pitching metrics and then compare home vs. away performance.


In [35]:
for year in range(2021, 2026):
    globals()[f"starter_lines_{year}"] = aggregate_pitching_game_lines(
        globals()[f"pa_starter_{year}"],
        pitcher_id_col="pitcher_name"
    )

for year in range(2021, 2026):
    globals()[f"bullpen_lines_{year}"] = aggregate_pitching_game_lines(
        globals()[f"pa_bullpen_{year}"]
    )

In [36]:
display(HTML("<h4>Starting Pitcher Season 2021</h4>")); display(starter_lines_2021.head(5))
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(starter_lines_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(starter_lines_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(starter_lines_2024.head(5))
display(HTML("<h4>Starting Pitcher Season 2025</h4>")); display(starter_lines_2025.head(5))
print(" ")
display(HTML("<h4>Bullpen Pitcher Season 2021</h4>")); display(bullpen_lines_2021.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2022</h4>")); display(bullpen_lines_2022.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2023</h4>")); display(bullpen_lines_2023.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2024</h4>")); display(bullpen_lines_2024.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2025</h4>")); display(bullpen_lines_2025.head(5))


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR
0,20210401_ATL@PHI_Alvarado_José,2021-04-01,PHI,1,starter,"Alvarado, José",1.0,1,1,1,3,0
1,20210401_ATL@PHI_Bradley_Archie,2021-04-01,PHI,1,starter,"Bradley, Archie",0.333333,0,1,0,0,0
2,20210401_ATL@PHI_Brogdon_Connor,2021-04-01,PHI,1,starter,"Brogdon, Connor",1.0,0,0,0,0,0
3,20210401_ATL@PHI_Fried_Max,2021-04-01,ATL,0,starter,"Fried, Max",4.666667,6,2,1,8,0
4,20210401_ATL@PHI_Jones_Nate,2021-04-01,ATL,0,starter,"Jones, Nate",0.666667,1,1,0,0,0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR
0,20220407_CIN@ATL,2022-04-07,ATL,1,starter,"Fried, Max",5.666667,8,1,1,5,0
1,20220407_CIN@ATL,2022-04-07,CIN,0,starter,"Mahle, Tyler",5.0,3,2,0,7,0
2,20220407_CLE@KC,2022-04-07,CLE,0,starter,"Bieber, Shane",4.666667,2,0,0,4,0
3,20220407_CLE@KC,2022-04-07,KC,1,starter,"Greinke, Zack",5.666667,5,1,0,1,0
4,20220407_HOU@LAA,2022-04-07,HOU,0,starter,"Valdez, Framber",6.666667,2,1,0,6,0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR
0,20230330_ATL@WSH,2023-03-30,ATL,0,starter,"Fried, Max",3.333333,4,0,0,2,0
1,20230330_ATL@WSH,2023-03-30,WSH,1,starter,"Corbin, Patrick",3.0,7,3,0,3,0
2,20230330_AZ@LAD,2023-03-30,AZ,0,starter,"Gallen, Zac",4.666667,6,3,0,7,0
3,20230330_AZ@LAD,2023-03-30,LAD,1,starter,"Urías, Julio",5.666667,4,0,1,6,0
4,20230330_BAL@BOS,2023-03-30,BAL,0,starter,"Gibson, Kyle",5.0,6,1,1,3,0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR
0,20240320_LAD@SD,2024-03-20,LAD,0,starter,"Glasnow, Tyler",5.0,2,4,0,3,0
1,20240320_LAD@SD,2024-03-20,SD,1,starter,"Darvish, Yu",3.666667,2,3,0,3,0
2,20240321_SD@LAD,2024-03-21,LAD,1,starter,"Yamamoto, Yoshinobu",1.0,4,1,1,2,0
3,20240321_SD@LAD,2024-03-21,SD,0,starter,"Musgrove, Joe",2.666667,7,2,0,2,0
4,20240328_BOS@SEA,2024-03-28,BOS,0,starter,"Bello, Brayan",5.0,5,0,1,2,1


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,K,HR
0,20250318_LAD@CHC,2025-03-18,CHC,1,starter,"Imanaga, Shota",4.0,0,4,0,2,0
1,20250318_LAD@CHC,2025-03-18,LAD,0,starter,"Yamamoto, Yoshinobu",5.0,3,1,0,4,0
2,20250319_LAD@CHC,2025-03-19,CHC,1,starter,"Steele, Justin",4.0,5,1,0,5,2
3,20250319_LAD@CHC,2025-03-19,LAD,0,starter,"Sasaki, Roki",3.0,1,5,0,3,0
4,20250327_ATH@SEA,2025-03-27,ATH,0,starter,"Severino, Luis",6.0,3,4,1,6,0


 


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR
0,20220407_CIN@ATL,2022-04-07,ATL,1,bullpen,3.333333,2,0,0,8,1
1,20220407_CIN@ATL,2022-04-07,CIN,0,bullpen,4.0,1,3,0,6,1
2,20220407_CLE@KC,2022-04-07,CLE,0,bullpen,3.0,2,2,0,2,0
3,20220407_CLE@KC,2022-04-07,KC,1,bullpen,3.333333,2,3,0,4,0
4,20220407_HOU@LAA,2022-04-07,HOU,0,bullpen,2.333333,2,0,1,1,0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR
0,20230330_ATL@WSH,2023-03-30,ATL,0,bullpen,5.666667,4,4,0,3,0
1,20230330_ATL@WSH,2023-03-30,WSH,1,bullpen,6.0,5,3,0,4,0
2,20230330_AZ@LAD,2023-03-30,AZ,0,bullpen,3.0,6,2,0,5,1
3,20230330_AZ@LAD,2023-03-30,LAD,1,bullpen,3.0,0,0,0,2,0
4,20230330_BAL@BOS,2023-03-30,BAL,0,bullpen,4.0,5,2,1,6,0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR
0,20240320_LAD@SD,2024-03-20,LAD,0,bullpen,4.0,2,0,0,3,0
1,20240320_LAD@SD,2024-03-20,SD,1,bullpen,5.333333,5,6,1,4,0
2,20240321_SD@LAD,2024-03-21,LAD,1,bullpen,8.0,14,5,1,6,1
3,20240321_SD@LAD,2024-03-21,SD,0,bullpen,6.333333,9,4,1,7,1
4,20240328_BOS@SEA,2024-03-28,BOS,0,bullpen,4.0,3,1,0,7,1


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR
0,20250318_LAD@CHC,2025-03-18,CHC,1,bullpen,5.0,7,4,0,7,0
1,20250318_LAD@CHC,2025-03-18,LAD,0,bullpen,4.0,0,0,1,5,0
2,20250319_LAD@CHC,2025-03-19,CHC,1,bullpen,5.0,2,6,0,4,1
3,20250319_LAD@CHC,2025-03-19,LAD,0,bullpen,6.0,7,2,0,9,0
4,20250327_ATH@SEA,2025-03-27,ATH,0,bullpen,1.666667,2,3,0,1,2


### Copying Dataframes

Below, I will make copies of `starter_lines_yyyy` and `bullpen_lines_yyyy`. These will be used for helping fill in missing data.

In [37]:
for y in range(2021, 2026):
    starters = globals()[f"starter_lines_{y}"].copy()
    bullpen  = globals()[f"bullpen_lines_{y}"].copy()

    # drop any columns that start with "roll"
    starters = starters.loc[:, ~starters.columns.str.startswith("roll")]
    bullpen  = bullpen.loc[:, ~bullpen.columns.str.startswith("roll")]

    globals()[f"starters_{y}"] = starters
    globals()[f"bullpen_{y}"] = bullpen

### Rolling Pitching Counts

Next, we compute rolling **3-day** and **7-day** totals for the pitching statistics needed to construct our rate metrics (IP, H, BB, HBP, K, HR). We calculate these rolling counts at different levels depending on pitcher role: for **starters**, we roll at the individual pitcher level, while for the **bullpen**, we roll at the team level. This produces time-based aggregates using only prior games, which are later used to compute rolling FIP, WHIP, K/9, and HR/9.


In [38]:
for year in range(2021, 2026):
    name = f"starter_lines_{year}"
    df = globals()[name].copy()

    if "pitcher_role" not in df.columns:
        df["pitcher_role"] = "starter"

    globals()[name] = add_rolling_pitching_counts(
        df,
        pitcher_col="pitcher_name"
    )

for year in range(2021, 2026):
    name = f"bullpen_lines_{year}"
    globals()[name] = add_rolling_pitching_counts(globals()[name])


  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)
  out = out.groupby(group_keys, group_keys=False).apply(_add_rolls).reset_index(drop=True)


In [39]:
display(HTML("<h4>Starting Pitcher Season 2021</h4>")); display(starter_lines_2021.head(5))
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(starter_lines_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(starter_lines_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(starter_lines_2024.head(5))
display(HTML("<h4>Starting Pitcher Season 2025</h4>")); display(starter_lines_2025.head(5))
print(" ")
display(HTML("<h4>Bullpen Pitcher Season 2021</h4>")); display(bullpen_lines_2021.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2022</h4>")); display(bullpen_lines_2022.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2023</h4>")); display(bullpen_lines_2023.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2024</h4>")); display(bullpen_lines_2024.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2025</h4>")); display(bullpen_lines_2025.head(5))


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,20210815_BAL@BOS_Abad_Fernando,2021-08-15,BAL,0,starter,"Abad, Fernando",0.333333,1,2,0,...,,,,,,,,,,
1,20210816_BAL@TB_Abad_Fernando,2021-08-16,BAL,0,starter,"Abad, Fernando",2.0,3,0,0,...,2.0,0.0,1.0,0.0,0.333333,1.0,2.0,0.0,1.0,0.0
2,20210819_BAL@TB_Abad_Fernando,2021-08-19,BAL,0,starter,"Abad, Fernando",1.333333,2,2,0,...,2.0,0.0,2.0,1.0,2.333333,4.0,2.0,0.0,2.0,1.0
3,20210824_LAA@BAL_Abad_Fernando,2021-08-24,BAL,1,starter,"Abad, Fernando",3.333333,3,1,0,...,2.0,0.0,0.0,0.0,3.666667,6.0,4.0,0.0,2.0,1.0
4,20210828_TB@BAL_Abad_Fernando,2021-08-28,BAL,1,starter,"Abad, Fernando",1.0,0,0,0,...,1.0,0.0,3.0,0.0,4.666667,5.0,3.0,0.0,3.0,0.0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,20220802_NYM@WSH,2022-08-02,WSH,1,starter,"Abbott, Cory",5.0,2,2,1,...,,,,,,,,,,
1,20220807_WSH@PHI,2022-08-07,WSH,0,starter,"Abbott, Cory",3.333333,7,5,1,...,2.0,1.0,3.0,0.0,5.0,2.0,2.0,1.0,3.0,0.0
2,20220812_SD@WSH,2022-08-12,WSH,1,starter,"Abbott, Cory",4.0,4,3,0,...,5.0,1.0,2.0,4.0,8.333333,9.0,7.0,2.0,5.0,4.0
3,20220817_CHC@WSH,2022-08-17,WSH,1,starter,"Abbott, Cory",6.0,3,1,0,...,3.0,0.0,5.0,0.0,7.333333,11.0,8.0,1.0,7.0,4.0
4,20220907_WSH@STL,2022-09-07,WSH,0,starter,"Abbott, Cory",4.0,5,0,0,...,1.0,0.0,5.0,1.0,10.0,7.0,4.0,0.0,10.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,20230605_MIL@CIN,2023-06-05,CIN,1,starter,"Abbott, Andrew",6.0,1,4,0,...,,,,,,,,,,
1,20230610_CIN@STL,2023-06-10,CIN,0,starter,"Abbott, Andrew",5.666667,5,3,0,...,4.0,0.0,6.0,0.0,6.0,1.0,4.0,0.0,6.0,0.0
2,20230616_CIN@HOU,2023-06-16,CIN,0,starter,"Abbott, Andrew",6.0,4,2,0,...,3.0,0.0,4.0,0.0,11.666667,6.0,7.0,0.0,10.0,0.0
3,20230621_COL@CIN,2023-06-21,CIN,1,starter,"Abbott, Andrew",6.0,4,0,0,...,2.0,0.0,2.0,0.0,11.666667,9.0,5.0,0.0,6.0,0.0
4,20230627_CIN@BAL,2023-06-27,CIN,0,starter,"Abbott, Andrew",6.0,2,3,0,...,0.0,0.0,10.0,3.0,12.0,8.0,2.0,0.0,12.0,3.0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,20240401_CIN@PHI,2024-04-01,CIN,0,starter,"Abbott, Andrew",5.333333,3,2,0,...,,,,,,,,,,
1,20240407_NYM@CIN,2024-04-07,CIN,1,starter,"Abbott, Andrew",5.0,7,2,1,...,2.0,0.0,4.0,0.0,5.333333,3.0,2.0,0.0,4.0,0.0
2,20240412_CIN@CWS,2024-04-12,CIN,0,starter,"Abbott, Andrew",7.0,4,0,0,...,2.0,1.0,4.0,1.0,10.333333,10.0,4.0,1.0,8.0,1.0
3,20240417_CIN@SEA,2024-04-17,CIN,0,starter,"Abbott, Andrew",6.0,4,3,0,...,0.0,0.0,3.0,0.0,12.0,11.0,2.0,1.0,7.0,1.0
4,20240423_PHI@CIN,2024-04-23,CIN,1,starter,"Abbott, Andrew",4.333333,2,4,0,...,3.0,0.0,6.0,2.0,13.0,8.0,3.0,0.0,9.0,2.0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_3D_starter_BB,roll_3D_starter_HBP,roll_3D_starter_K,roll_3D_starter_HR,roll_7D_starter_IP,roll_7D_starter_H,roll_7D_starter_BB,roll_7D_starter_HBP,roll_7D_starter_K,roll_7D_starter_HR
0,20250412_PIT@CIN,2025-04-12,CIN,1,starter,"Abbott, Andrew",5.0,2,2,0,...,,,,,,,,,,
1,20250418_CIN@BAL,2025-04-18,CIN,0,starter,"Abbott, Andrew",6.0,2,1,0,...,2.0,0.0,5.0,1.0,5.0,2.0,2.0,0.0,5.0,1.0
2,20250425_CIN@COL,2025-04-25,CIN,0,starter,"Abbott, Andrew",4.0,5,5,0,...,1.0,0.0,11.0,1.0,11.0,4.0,3.0,0.0,16.0,2.0
3,20250501_STL@CIN,2025-05-01,CIN,1,starter,"Abbott, Andrew",4.0,3,4,0,...,5.0,0.0,4.0,1.0,4.0,5.0,5.0,0.0,4.0,1.0
4,20250506_CIN@ATL,2025-05-06,CIN,0,starter,"Abbott, Andrew",5.0,4,0,0,...,4.0,0.0,3.0,0.0,8.0,8.0,9.0,0.0,7.0,1.0


 


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,20220408_ATH@PHI,2022-04-08,ATH,0,bullpen,2.666667,5,4,0,2,...,,,,,,,,,,
1,20220409_ATH@PHI,2022-04-09,ATH,0,bullpen,2.333333,0,3,0,3,...,4.0,0.0,2.0,0.0,2.666667,5.0,4.0,0.0,2.0,0.0
2,20220410_ATH@PHI,2022-04-10,ATH,0,bullpen,4.0,1,1,0,5,...,7.0,0.0,5.0,0.0,5.0,5.0,7.0,0.0,5.0,0.0
3,20220411_ATH@TB,2022-04-11,ATH,0,bullpen,4.0,5,0,0,6,...,8.0,0.0,10.0,1.0,9.0,6.0,8.0,0.0,10.0,1.0
4,20220412_ATH@TB,2022-04-12,ATH,0,bullpen,8.0,8,5,0,8,...,4.0,0.0,14.0,1.0,13.0,11.0,8.0,0.0,16.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,20230330_LAA@ATH,2023-03-30,ATH,1,bullpen,4.0,1,3,0,5,...,,,,,,,,,,
1,20230401_LAA@ATH,2023-04-01,ATH,1,bullpen,6.666667,6,3,0,3,...,3.0,0.0,5.0,0.0,4.0,1.0,3.0,0.0,5.0,0.0
2,20230402_LAA@ATH,2023-04-02,ATH,1,bullpen,3.333333,2,1,0,2,...,6.0,0.0,8.0,1.0,10.666667,7.0,6.0,0.0,8.0,1.0
3,20230403_CLE@ATH,2023-04-03,ATH,1,bullpen,5.0,9,2,1,3,...,4.0,0.0,5.0,1.0,14.0,9.0,7.0,0.0,10.0,1.0
4,20230404_CLE@ATH,2023-04-04,ATH,1,bullpen,4.333333,0,2,0,5,...,6.0,1.0,8.0,1.0,19.0,18.0,9.0,1.0,13.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,20240328_CLE@ATH,2024-03-28,ATH,1,bullpen,5.333333,4,2,2,4,...,,,,,,,,,,
1,20240329_CLE@ATH,2024-03-29,ATH,1,bullpen,3.666667,2,4,0,3,...,2.0,2.0,4.0,0.0,5.333333,4.0,2.0,2.0,4.0,0.0
2,20240330_CLE@ATH,2024-03-30,ATH,1,bullpen,5.333333,9,3,1,5,...,6.0,2.0,7.0,0.0,9.0,6.0,6.0,2.0,7.0,0.0
3,20240331_CLE@ATH,2024-03-31,ATH,1,bullpen,1.666667,4,1,1,2,...,9.0,3.0,12.0,1.0,14.333333,15.0,9.0,3.0,12.0,1.0
4,20240401_BOS@ATH,2024-04-01,ATH,1,bullpen,6.333333,1,3,0,7,...,8.0,2.0,10.0,1.0,16.0,19.0,10.0,4.0,14.0,1.0


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_3D_bullpen_BB,roll_3D_bullpen_HBP,roll_3D_bullpen_K,roll_3D_bullpen_HR,roll_7D_bullpen_IP,roll_7D_bullpen_H,roll_7D_bullpen_BB,roll_7D_bullpen_HBP,roll_7D_bullpen_K,roll_7D_bullpen_HR
0,20250327_ATH@SEA,2025-03-27,ATH,0,bullpen,1.666667,2,3,0,1,...,,,,,,,,,,
1,20250328_ATH@SEA,2025-03-28,ATH,0,bullpen,3.0,2,2,0,4,...,3.0,0.0,1.0,2.0,1.666667,2.0,3.0,0.0,1.0,2.0
2,20250329_ATH@SEA,2025-03-29,ATH,0,bullpen,4.0,2,2,0,8,...,5.0,0.0,5.0,2.0,4.666667,4.0,5.0,0.0,5.0,2.0
3,20250330_ATH@SEA,2025-03-30,ATH,0,bullpen,1.0,0,0,0,2,...,7.0,0.0,13.0,2.0,8.666667,6.0,7.0,0.0,13.0,2.0
4,20250331_CHC@ATH,2025-03-31,ATH,1,bullpen,5.0,12,6,1,5,...,4.0,0.0,14.0,0.0,9.666667,6.0,7.0,0.0,15.0,2.0


### Rolling Pitching Rate Metrics

After computing rolling 3-day and 7-day **count** totals (IP, H, BB, HBP, K, HR), we can now convert these aggregates into rolling **rate-based** pitching metrics: **WHIP**, **K/9**, **HR/9**, and **FIP**. Importantly, these rates are calculated from the rolled sums (rather than rolling the ratios directly), and the logic applies to both **starting pitchers** (pitcher-level rolling) and the **bullpen** (team-level rolling).


**NOTE TO SELF**: Double check it is rolling correclty

In [42]:
for year in range(2021, 2026):
    name = f"starter_lines_{year}"
    globals()[name] = add_rate_metrics_from_rolled_counts(
        globals()[name],
        windows=("3D", "7D")
    )

for year in range(2021, 2026):
    name = f"bullpen_lines_{year}"
    globals()[name] = add_rate_metrics_from_rolled_counts(
        globals()[name],
        windows=("3D", "7D")
    )


In [43]:
display(HTML("<h4>Starting Pitcher Season 2021</h4>")); display(starter_lines_2021.head(5))
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(starter_lines_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(starter_lines_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(starter_lines_2024.head(5))
display(HTML("<h4>Starting Pitcher Season 2025</h4>")); display(starter_lines_2025.head(5))
print(" ")
display(HTML("<h4>Bullpen Pitcher Season 2021</h4>")); display(bullpen_lines_2021.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2022</h4>")); display(bullpen_lines_2022.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2023</h4>")); display(bullpen_lines_2023.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2024</h4>")); display(bullpen_lines_2024.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2025</h4>")); display(bullpen_lines_2025.head(5))


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20210815_BAL@BOS_Abad_Fernando,2021-08-15,BAL,0,starter,"Abad, Fernando",0.333333,1,2,0,...,,,,,,,,,,
1,20210816_BAL@TB_Abad_Fernando,2021-08-16,BAL,0,starter,"Abad, Fernando",2.0,3,0,0,...,1.0,0.0,9.0,27.0,0.0,12.0,9.0,27.0,0.0,12.0
2,20210819_BAL@TB_Abad_Fernando,2021-08-19,BAL,0,starter,"Abad, Fernando",1.333333,2,2,0,...,2.0,1.0,2.571429,7.714286,3.857143,6.428571,2.571429,7.714286,3.857143,6.428571
3,20210824_LAA@BAL_Abad_Fernando,2021-08-24,BAL,1,starter,"Abad, Fernando",3.333333,3,1,0,...,2.0,1.0,3.0,0.0,0.0,4.5,2.727273,4.909091,2.454545,5.727273
4,20210828_TB@BAL_Abad_Fernando,2021-08-28,BAL,1,starter,"Abad, Fernando",1.0,0,0,0,...,3.0,0.0,1.2,8.1,0.0,-0.9,1.714286,5.785714,0.0,0.642857


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20220802_NYM@WSH,2022-08-02,WSH,1,starter,"Abbott, Cory",5.0,2,2,1,...,,,,,,,,,,
1,20220807_WSH@PHI,2022-08-07,WSH,0,starter,"Abbott, Cory",3.333333,7,5,1,...,3.0,0.0,1.0,5.4,0.0,0.6,1.0,5.4,0.0,0.6
2,20220812_SD@WSH,2022-08-12,WSH,1,starter,"Abbott, Cory",4.0,4,3,0,...,5.0,4.0,3.9,5.4,10.8,19.8,2.16,5.4,4.32,8.28
3,20220817_CHC@WSH,2022-08-17,WSH,1,starter,"Abbott, Cory",6.0,3,1,0,...,7.0,4.0,1.75,11.25,0.0,-0.25,2.727273,8.590909,4.909091,8.863636
4,20220907_WSH@STL,2022-09-07,WSH,0,starter,"Abbott, Cory",4.0,5,0,0,...,10.0,1.0,0.666667,7.5,1.5,1.0,1.1,9.0,0.9,0.5


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20230605_MIL@CIN,2023-06-05,CIN,1,starter,"Abbott, Andrew",6.0,1,4,0,...,,,,,,,,,,
1,20230610_CIN@STL,2023-06-10,CIN,0,starter,"Abbott, Andrew",5.666667,5,3,0,...,6.0,0.0,0.833333,9.0,0.0,0.0,0.833333,9.0,0.0,0.0
2,20230616_CIN@HOU,2023-06-16,CIN,0,starter,"Abbott, Andrew",6.0,4,2,0,...,10.0,0.0,1.411765,6.352941,0.0,0.176471,1.114286,7.714286,0.0,0.085714
3,20230621_COL@CIN,2023-06-21,CIN,1,starter,"Abbott, Andrew",6.0,4,0,0,...,6.0,0.0,1.0,3.0,0.0,0.333333,1.2,4.628571,0.0,0.257143
4,20230627_CIN@BAL,2023-06-27,CIN,0,starter,"Abbott, Andrew",6.0,2,3,0,...,12.0,3.0,0.666667,15.0,4.5,3.166667,0.833333,9.0,2.25,1.75


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20240401_CIN@PHI,2024-04-01,CIN,0,starter,"Abbott, Andrew",5.333333,3,2,0,...,,,,,,,,,,
1,20240407_NYM@CIN,2024-04-07,CIN,1,starter,"Abbott, Andrew",5.0,7,2,1,...,4.0,0.0,0.9375,6.75,0.0,-0.375,0.9375,6.75,0.0,-0.375
2,20240412_CIN@CWS,2024-04-12,CIN,0,starter,"Abbott, Andrew",7.0,4,0,0,...,8.0,1.0,2.0,7.2,1.8,2.8,1.451613,6.967742,0.870968,1.16129
3,20240417_CIN@SEA,2024-04-17,CIN,0,starter,"Abbott, Andrew",6.0,4,3,0,...,7.0,1.0,0.571429,3.857143,0.0,-0.857143,1.166667,5.25,0.75,0.666667
4,20240423_PHI@CIN,2024-04-23,CIN,1,starter,"Abbott, Andrew",4.333333,2,4,0,...,9.0,2.0,1.166667,9.0,3.0,3.833333,0.846154,6.230769,1.384615,1.307692


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20250412_PIT@CIN,2025-04-12,CIN,1,starter,"Abbott, Andrew",5.0,2,2,0,...,,,,,,,,,,
1,20250418_CIN@BAL,2025-04-18,CIN,0,starter,"Abbott, Andrew",6.0,2,1,0,...,5.0,1.0,0.8,9.0,1.8,1.8,0.8,9.0,1.8,1.8
2,20250425_CIN@COL,2025-04-25,CIN,0,starter,"Abbott, Andrew",4.0,5,5,0,...,16.0,2.0,0.5,16.5,1.5,-1.0,0.636364,13.090909,1.636364,0.272727
3,20250501_STL@CIN,2025-05-01,CIN,1,starter,"Abbott, Andrew",4.0,3,4,0,...,4.0,1.0,2.5,9.0,2.25,5.0,2.5,9.0,2.25,5.0
4,20250506_CIN@ATL,2025-05-06,CIN,0,starter,"Abbott, Andrew",5.0,4,0,0,...,7.0,1.0,1.75,6.75,0.0,1.5,2.125,7.875,1.125,3.25


 


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,HR


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,20220408_ATH@PHI,2022-04-08,ATH,0,bullpen,2.666667,5,4,0,2,...,,,,,,,,,,
1,20220409_ATH@PHI,2022-04-09,ATH,0,bullpen,2.333333,0,3,0,3,...,2.0,0.0,3.375,6.75,0.0,3.0,3.375,6.75,0.0,3.0
2,20220410_ATH@PHI,2022-04-10,ATH,0,bullpen,4.0,1,1,0,5,...,5.0,0.0,2.4,9.0,0.0,2.2,2.4,9.0,0.0,2.2
3,20220411_ATH@TB,2022-04-11,ATH,0,bullpen,4.0,5,0,0,6,...,10.0,1.0,1.555556,10.0,1.0,1.888889,1.555556,10.0,1.0,1.888889
4,20220412_ATH@TB,2022-04-12,ATH,0,bullpen,8.0,8,5,0,8,...,16.0,1.0,0.967742,12.193548,0.870968,-0.290323,1.461538,11.076923,0.692308,0.384615


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,20230330_LAA@ATH,2023-03-30,ATH,1,bullpen,4.0,1,3,0,5,...,,,,,,,,,,
1,20230401_LAA@ATH,2023-04-01,ATH,1,bullpen,6.666667,6,3,0,3,...,5.0,0.0,1.0,11.25,0.0,-0.25,1.0,11.25,0.0,-0.25
2,20230402_LAA@ATH,2023-04-02,ATH,1,bullpen,3.333333,2,1,0,2,...,8.0,1.0,1.21875,6.75,0.84375,1.40625,1.21875,6.75,0.84375,1.40625
3,20230403_CLE@ATH,2023-04-03,ATH,1,bullpen,5.0,9,2,1,3,...,10.0,1.0,1.2,4.5,0.9,1.5,1.142857,6.428571,0.642857,1.0
4,20230404_CLE@ATH,2023-04-04,ATH,1,bullpen,4.333333,0,2,0,5,...,13.0,1.0,1.6,4.8,0.6,1.2,1.473684,6.157895,0.473684,0.894737


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,20240328_CLE@ATH,2024-03-28,ATH,1,bullpen,5.333333,4,2,2,4,...,,,,,,,,,,
1,20240329_CLE@ATH,2024-03-29,ATH,1,bullpen,3.666667,2,4,0,3,...,4.0,0.0,1.5,6.75,0.0,0.75,1.5,6.75,0.0,0.75
2,20240330_CLE@ATH,2024-03-30,ATH,1,bullpen,5.333333,9,3,1,5,...,7.0,0.0,1.555556,7.0,0.0,1.111111,1.555556,7.0,0.0,1.111111
3,20240331_CLE@ATH,2024-03-31,ATH,1,bullpen,1.666667,4,1,1,2,...,12.0,1.0,1.883721,7.534884,0.627907,1.744186,1.883721,7.534884,0.627907,1.744186
4,20240401_BOS@ATH,2024-04-01,ATH,1,bullpen,6.333333,1,3,0,7,...,14.0,1.0,2.34375,8.4375,0.84375,2.15625,2.0625,7.875,0.5625,1.6875


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,20250327_ATH@SEA,2025-03-27,ATH,0,bullpen,1.666667,2,3,0,1,...,,,,,,,,,,
1,20250328_ATH@SEA,2025-03-28,ATH,0,bullpen,3.0,2,2,0,4,...,1.0,2.0,3.0,5.4,10.8,19.8,3.0,5.4,10.8,19.8
2,20250329_ATH@SEA,2025-03-29,ATH,0,bullpen,4.0,2,2,0,8,...,5.0,2.0,1.928571,9.642857,3.857143,6.642857,1.928571,9.642857,3.857143,6.642857
3,20250330_ATH@SEA,2025-03-30,ATH,0,bullpen,1.0,0,0,0,2,...,13.0,2.0,1.5,13.5,2.076923,2.423077,1.5,13.5,2.076923,2.423077
4,20250331_CHC@ATH,2025-03-31,ATH,1,bullpen,5.0,12,6,1,5,...,15.0,2.0,1.0,15.75,0.0,-2.0,1.344828,13.965517,1.862069,1.758621


## Missing Data

Before combining the starting pitcher and bullpen dataframes for each season, we need to address the issue of missing data. Because all rolling features are computed using prior games only, the first game(s) of each season naturally contain missing values. These missing values are structural and arise from the absence of historical data within the rolling window.

To handle this, I impute missing values using information from the previous season only, avoiding any leakage from the current season.

**Starting Pitchers**
1. If the starting pitcher appeared in the previous season, I impute using that pitcher’s previous-season average.
2. If the pitcher did not appear in the previous season, I impute using the league-wide average across all starting pitchers.

**Bullpen**
- Bullpen features are imputed using the team’s previous-season bullpen average, reflecting bullpen performance as a team-level construct.



### Summarizing Pitcher Rates

- Explain waht this does

In [44]:
'''
Move to a script
'''

def summarize_pitching_rates(
    df: pd.DataFrame,
    kind: str = "starter",  # "starter" or "bullpen"
    pitcher_col: str = "pitcher_name",
    team_col: str = "pitching_team",
    ip_col: str = "IP",
    h_col: str = "H",
    bb_col: str = "BB",
    hbp_col: str = "HBP",
    k_col: str = "K",
    hr_col: str = "HR",
    include_hbp_in_whip: bool = True,
    fip_constant: float | None = None,  # None = raw FIP
    overall_label: str | None = None,   # optional custom label for the top row
) -> pd.DataFrame:
    """
    Aggregate pitching counts and compute WHIP, K/9, HR/9, FIP from totals.

    Output: ONE dataframe with the overall (innings-weighted, from totals) row at the top,
    followed by group rows:
      - kind="starter": group by pitcher_col
      - kind="bullpen": group by team_col

    Notes:
    - Rates are computed from aggregated totals (recommended), NOT averaged per-game or per-pitcher rates.
    - Groups with IP == 0 get NaN for rate stats.
    - No unweighted mean row is included.
    """
    kind = kind.lower().strip()
    if kind not in {"starter", "bullpen"}:
        raise ValueError('kind must be "starter" or "bullpen"')

    group_col = pitcher_col if kind == "starter" else team_col
    if group_col not in df.columns:
        raise ValueError(f"Expected grouping column '{group_col}' not found in df.")

    required = [ip_col, h_col, bb_col, hbp_col, k_col, hr_col]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    x = df.copy()
    for c in required:
        x[c] = pd.to_numeric(x[c], errors="coerce").fillna(0)

    # --- group totals ---
    by = (
        x.groupby(group_col, as_index=False)
         .agg(
             IP=(ip_col, "sum"),
             H=(h_col, "sum"),
             BB=(bb_col, "sum"),
             HBP=(hbp_col, "sum"),
             K=(k_col, "sum"),
             HR=(hr_col, "sum"),
         )
    )

    # --- compute rates from totals ---
    ip = by["IP"].to_numpy(dtype=float)
    h  = by["H"].to_numpy(dtype=float)
    bb = by["BB"].to_numpy(dtype=float)
    hbp= by["HBP"].to_numpy(dtype=float)
    k  = by["K"].to_numpy(dtype=float)
    hr = by["HR"].to_numpy(dtype=float)

    whip_num = h + bb + (hbp if include_hbp_in_whip else 0.0)

    with np.errstate(divide="ignore", invalid="ignore"):
        by["WHIP"] = np.where(ip > 0, whip_num / ip, np.nan)
        by["K9"]   = np.where(ip > 0, (k  * 9.0) / ip, np.nan)
        by["HR9"]  = np.where(ip > 0, (hr * 9.0) / ip, np.nan)

        fip_num = 13.0 * hr + 3.0 * (bb + hbp) - 2.0 * k
        fip = np.where(ip > 0, fip_num / ip, np.nan)
        if fip_constant is not None:
            fip = fip + float(fip_constant)
        by["FIP"] = fip

    # --- overall (innings-weighted) totals-based row ---
    totals = by[["IP", "H", "BB", "HBP", "K", "HR"]].sum(numeric_only=True)
    IPt, Ht, BBt, HBPt, Kt, HRt = [float(totals[c]) for c in ["IP", "H", "BB", "HBP", "K", "HR"]]

    if overall_label is None:
        overall_label = f"ALL_{kind.upper()}"

    whip_num_t = Ht + BBt + (HBPt if include_hbp_in_whip else 0.0)

    overall = {
        group_col: overall_label,
        "IP": IPt,
        "H": Ht,
        "BB": BBt,
        "HBP": HBPt,
        "K": Kt,
        "HR": HRt,
        "WHIP": (whip_num_t / IPt) if IPt > 0 else np.nan,
        "K9": (Kt * 9.0 / IPt) if IPt > 0 else np.nan,
        "HR9": (HRt * 9.0 / IPt) if IPt > 0 else np.nan,
        "FIP": ((13.0 * HRt + 3.0 * (BBt + HBPt) - 2.0 * Kt) / IPt + (float(fip_constant) if fip_constant is not None else 0.0))
               if IPt > 0 else np.nan,
        "mean_type": "weighted_by_IP (from totals)",
    }

    # sort groups (optional): most IP first
    by = by.sort_values(["IP", group_col], ascending=[False, True]).reset_index(drop=True)

    # final: overall row on top
    out = pd.concat([pd.DataFrame([overall]), by.assign(mean_type="group_totals")], ignore_index=True)

    # column order
    out = out[[group_col, "IP", "H", "BB", "HBP", "K", "HR", "WHIP", "K9", "HR9", "FIP", "mean_type"]]

    return out


In [45]:
for year in range(2021, 2026):
    globals()[f"starter_summary_{year}"] = summarize_pitching_rates(
        globals()[f"starters_{year}"],
        kind="starter",
    )

    globals()[f"bullpen_summary_{year}"] = summarize_pitching_rates(
        globals()[f"bullpen_{year}"],
        kind="bullpen",
    )

In [46]:
display(HTML("<h4>Starting Pitcher Season 2021</h4>")); display(starter_summary_2021.head(5))
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(starter_summary_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(starter_summary_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(starter_summary_2024.head(5))
display(HTML("<h4>Starting Pitcher Season 2025</h4>")); display(starter_summary_2025.head(5))
print(" ")
display(HTML("<h4>Bullpen Pitcher Season 2021</h4>")); display(bullpen_summary_2021.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2022</h4>")); display(bullpen_summary_2022.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2023</h4>")); display(bullpen_summary_2023.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2024</h4>")); display(bullpen_summary_2024.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2025</h4>")); display(bullpen_summary_2025.head(5))


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,42183.666667,39484.0,15794.0,2112.0,42145.0,5944.0,1.360479,8.99175,1.268169,1.107064,weighted_by_IP (from totals)
1,"Wheeler, Zack",212.666667,169.0,46.0,8.0,247.0,16.0,1.048589,10.452978,0.677116,-0.583072,group_totals
2,"Buehler, Walker",207.0,149.0,52.0,6.0,212.0,19.0,1.0,9.217391,0.826087,-0.014493,group_totals
3,"Wainwright, Adam",204.666667,168.0,50.0,9.0,174.0,21.0,1.109121,7.651466,0.923453,0.498371,group_totals
4,"Alcantara, Sandy",203.0,171.0,50.0,10.0,201.0,21.0,1.137931,8.91133,0.931034,0.251232,group_totals


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,25068.666667,23974.0,7978.0,1098.0,23046.0,3273.0,1.318379,8.273835,1.175053,0.944805,weighted_by_IP (from totals)
1,"Alcantara, Sandy",224.666667,174.0,50.0,9.0,207.0,16.0,1.037092,8.292285,0.64095,-0.12908,group_totals
2,"Nola, Aaron",202.666667,168.0,29.0,9.0,235.0,19.0,1.016447,10.435855,0.84375,-0.537829,group_totals
3,"Burnes, Corbin",200.666667,144.0,51.0,13.0,243.0,23.0,1.036545,10.898671,1.031561,0.024917,group_totals
4,"Valdez, Framber",199.666667,166.0,67.0,11.0,194.0,11.0,1.222037,8.744574,0.495826,-0.055092,group_totals


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,24702.666667,24324.0,8409.0,1069.0,23538.0,3682.0,1.368354,8.575673,1.341475,1.18303,weighted_by_IP (from totals)
1,"Webb, Logan",213.0,201.0,31.0,5.0,194.0,20.0,1.112676,8.197183,0.84507,-0.093897,group_totals
2,"Gallen, Zac",209.0,188.0,47.0,5.0,220.0,22.0,1.148325,9.473684,0.947368,0.009569,group_totals
3,"Cole, Gerrit",206.0,157.0,48.0,7.0,222.0,20.0,1.029126,9.699029,0.873786,-0.092233,group_totals
4,"Mikolas, Miles",198.0,226.0,39.0,8.0,137.0,26.0,1.378788,6.227273,1.181818,1.035354,group_totals


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,25044.666667,23976.0,8146.0,1068.0,23512.0,3421.0,1.325232,8.449224,1.229364,1.00185,weighted_by_IP (from totals)
1,"Gilbert, Logan",207.666667,148.0,37.0,4.0,220.0,26.0,0.910112,9.53451,1.126806,0.101124,group_totals
2,"Lugo, Seth",205.0,177.0,48.0,9.0,181.0,16.0,1.141463,7.946341,0.702439,0.082927,group_totals
3,"Webb, Logan",202.0,202.0,50.0,2.0,172.0,11.0,1.257426,7.663366,0.490099,-0.222772,group_totals
4,"Wheeler, Zack",198.666667,139.0,52.0,8.0,224.0,20.0,1.001678,10.147651,0.90604,-0.040268,group_totals


Unnamed: 0,pitcher_name,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_STARTER,24902.0,23841.0,8309.0,1020.0,23258.0,3512.0,1.332022,8.405831,1.269296,1.08935,weighted_by_IP (from totals)
1,"Webb, Logan",204.0,210.0,46.0,6.0,224.0,14.0,1.284314,9.882353,0.617647,-0.539216,group_totals
2,"Crochet, Garrett",202.0,165.0,46.0,3.0,255.0,24.0,1.059406,11.361386,1.069307,-0.252475,group_totals
3,"Sánchez, Cristopher",200.333333,171.0,44.0,6.0,212.0,12.0,1.103161,9.524126,0.539101,-0.589018,group_totals
4,"Rodón, Carlos",194.0,132.0,73.0,9.0,203.0,22.0,1.103093,9.417526,1.020619,0.649485,group_totals


 


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,0.0,0.0,0.0,0.0,0.0,0.0,,,,,weighted_by_IP (from totals)


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,17563.0,15714.0,6880.0,948.0,17795.0,1943.0,1.340432,9.118886,0.995673,0.748904,weighted_by_IP (from totals)
1,TB,675.666667,579.0,210.0,37.0,656.0,82.0,1.222496,8.738037,1.092255,0.73261,group_totals
2,CHC,654.0,589.0,274.0,41.0,719.0,98.0,1.382263,9.894495,1.348624,1.19419,group_totals
3,MIN,645.333333,593.0,241.0,30.0,673.0,78.0,1.338843,9.385847,1.08781,0.745351,group_totals
4,SF,643.333333,653.0,241.0,21.0,582.0,65.0,1.42228,8.141969,0.909326,0.725907,group_totals


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,17900.0,16527.0,7415.0,1045.0,18317.0,2187.0,1.395922,9.209665,1.099609,0.959609,weighted_by_IP (from totals)
1,SF,695.333333,669.0,232.0,35.0,700.0,75.0,1.346117,9.060403,0.970757,0.540748,group_totals
2,ATH,673.0,663.0,368.0,48.0,630.0,82.0,1.603269,8.424963,1.096582,1.566122,group_totals
3,TB,652.0,546.0,230.0,45.0,656.0,76.0,1.259202,9.055215,1.04908,0.768405,group_totals
4,BOS,651.666667,666.0,257.0,41.0,642.0,78.0,1.479284,8.866496,1.077238,0.957545,group_totals


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,17562.333333,15866.0,6787.0,952.0,17699.0,2035.0,1.34407,9.070036,1.042857,0.812762,weighted_by_IP (from totals)
1,DET,686.666667,573.0,209.0,33.0,622.0,75.0,1.186893,8.152427,0.98301,0.665534,group_totals
2,SF,647.666667,615.0,232.0,34.0,658.0,68.0,1.360268,9.143592,0.944931,0.565106,group_totals
3,MIA,645.333333,585.0,242.0,38.0,655.0,58.0,1.340393,9.134814,0.808884,0.440083,group_totals
4,MIL,644.333333,532.0,222.0,24.0,638.0,72.0,1.20745,8.911536,1.005691,0.617693,group_totals


Unnamed: 0,pitching_team,IP,H,BB,HBP,K,HR,WHIP,K9,HR9,FIP,mean_type
0,ALL_BULLPEN,17648.333333,16300.0,7073.0,908.0,17390.0,2138.0,1.375824,8.86826,1.090301,0.960827,weighted_by_IP (from totals)
1,LAD,649.333333,608.0,270.0,38.0,687.0,81.0,1.410678,9.522074,1.12269,0.928645,group_totals
2,CWS,648.0,607.0,294.0,32.0,645.0,78.0,1.439815,8.958333,1.083333,1.083333,group_totals
3,MIL,626.666667,543.0,244.0,29.0,620.0,61.0,1.302128,8.904255,0.876064,0.593617,group_totals
4,NYM,626.0,587.0,228.0,36.0,621.0,64.0,1.359425,8.928115,0.920128,0.610224,group_totals


### Imputation

In [47]:
'''
Move to Script
'''

def impute_pitching_roll_rates_from_prev_season(
    season_df: pd.DataFrame,
    prev_summary_df: pd.DataFrame,
    kind: str = "starter",  # "starter" or "bullpen"
    windows: tuple[str, ...] = ("3D", "7D"),
    metrics: tuple[str, ...] = ("WHIP", "K9", "HR9", "FIP"),
    pitcher_col: str = "pitcher_name",
    team_col: str = "pitching_team",
    roll_prefix: str = "roll_",
    starter_tag: str = "starter",
    bullpen_tag: str = "bullpen",
) -> pd.DataFrame:
    """
    Impute missing rolling RATE features in season t using season t-1 summary values.

    Starter case:
      - Expected columns: roll_{w}_starter_{metric}
      - Key: pitcher_name
      - Fallback: if pitcher not in prev summary OR prev value is NaN -> league mean row (ALL_STARTER)

    Bullpen case:
      - Expected columns: roll_{w}_bullpen_{metric}
      - Key: pitching_team
      - Fallback: if team not in prev summary -> league mean row (ALL_BULLPEN)

    Returns a copy of season_df with NaNs filled. (No extra indicator columns.)
    """
    kind = kind.lower().strip()
    if kind not in {"starter", "bullpen"}:
        raise ValueError('kind must be "starter" or "bullpen"')

    out = season_df.copy()

    if kind == "starter":
        group_col = pitcher_col
        overall_label = "ALL_STARTER"
        col_template = f"{roll_prefix}{{w}}_{starter_tag}_{{m}}"
    else:
        group_col = team_col
        overall_label = "ALL_BULLPEN"
        col_template = f"{roll_prefix}{{w}}_{bullpen_tag}_{{m}}"

    if group_col not in out.columns:
        raise ValueError(f"season_df missing grouping column '{group_col}'")

    if group_col not in prev_summary_df.columns:
        raise ValueError(f"prev_summary_df missing grouping column '{group_col}'")

    for m in metrics:
        if m not in prev_summary_df.columns:
            raise ValueError(f"prev_summary_df missing metric column '{m}'")

    prev = prev_summary_df[[group_col, *metrics]].copy()
    prev_map = prev.set_index(group_col)

    if overall_label not in prev_map.index:
        raise ValueError(
            f"prev_summary_df must contain overall row '{overall_label}' in column '{group_col}'."
        )

    league_vals = prev_map.loc[overall_label, list(metrics)].to_dict()

    def _prev_or_league(entity: str, metric: str) -> float:
        if entity in prev_map.index:
            v = prev_map.at[entity, metric]
            if pd.notna(v):
                return float(v)

        lv = league_vals.get(metric, np.nan)
        return float(lv) if pd.notna(lv) else np.nan

    roll_cols = []
    for w in windows:
        for m in metrics:
            c = col_template.format(w=w, m=m)
            if c in out.columns:
                roll_cols.append((w, m, c))

    if not roll_cols:
        example = col_template.format(w="3D", m=metrics[0])
        raise ValueError(
            f"No matching roll columns found in season_df. Example expected: '{example}'."
        )

    entities = out[group_col].astype("string").fillna("")

    for _, m, c in roll_cols:
        miss = out[c].isna()
        if miss.any():
            out.loc[miss, c] = entities.loc[miss].map(lambda e: _prev_or_league(str(e), m)).to_numpy()

    return out


In [48]:
for year in range(2022, 2026):  # 2023, 2024, 2025
    prev_year = year - 1

    # starters
    globals()[f"starter_lines_{year}"] = impute_pitching_roll_rates_from_prev_season(
        season_df=globals()[f"starter_lines_{year}"],
        prev_summary_df=globals()[f"starter_summary_{prev_year}"],
        kind="starter",
    )

    # bullpen
    globals()[f"bullpen_lines_{year}"] = impute_pitching_roll_rates_from_prev_season(
        season_df=globals()[f"bullpen_lines_{year}"],
        prev_summary_df=globals()[f"bullpen_summary_{prev_year}"],
        kind="bullpen",
    )

In [49]:
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(starter_lines_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(starter_lines_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(starter_lines_2024.head(5))
display(HTML("<h4>Starting Pitcher Season 2025</h4>")); display(starter_lines_2025.head(5))
print(" ")
display(HTML("<h4>Bullpen Pitcher Season 2022</h4>")); display(bullpen_lines_2022.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2023</h4>")); display(bullpen_lines_2023.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2024</h4>")); display(bullpen_lines_2024.head(5))
display(HTML("<h4>Bullpen Pitcher Season 2025</h4>")); display(bullpen_lines_2025.head(5))


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20220802_NYM@WSH,2022-08-02,WSH,1,starter,"Abbott, Cory",5.0,2,2,1,...,,,1.788462,6.230769,3.634615,5.769231,1.788462,6.230769,3.634615,5.769231
1,20220807_WSH@PHI,2022-08-07,WSH,0,starter,"Abbott, Cory",3.333333,7,5,1,...,3.0,0.0,1.0,5.4,0.0,0.6,1.0,5.4,0.0,0.6
2,20220812_SD@WSH,2022-08-12,WSH,1,starter,"Abbott, Cory",4.0,4,3,0,...,5.0,4.0,3.9,5.4,10.8,19.8,2.16,5.4,4.32,8.28
3,20220817_CHC@WSH,2022-08-17,WSH,1,starter,"Abbott, Cory",6.0,3,1,0,...,7.0,4.0,1.75,11.25,0.0,-0.25,2.727273,8.590909,4.909091,8.863636
4,20220907_WSH@STL,2022-09-07,WSH,0,starter,"Abbott, Cory",4.0,5,0,0,...,10.0,1.0,0.666667,7.5,1.5,1.0,1.1,9.0,0.9,0.5


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20230605_MIL@CIN,2023-06-05,CIN,1,starter,"Abbott, Andrew",6.0,1,4,0,...,,,1.318379,8.273835,1.175053,0.944805,1.318379,8.273835,1.175053,0.944805
1,20230610_CIN@STL,2023-06-10,CIN,0,starter,"Abbott, Andrew",5.666667,5,3,0,...,6.0,0.0,0.833333,9.0,0.0,0.0,0.833333,9.0,0.0,0.0
2,20230616_CIN@HOU,2023-06-16,CIN,0,starter,"Abbott, Andrew",6.0,4,2,0,...,10.0,0.0,1.411765,6.352941,0.0,0.176471,1.114286,7.714286,0.0,0.085714
3,20230621_COL@CIN,2023-06-21,CIN,1,starter,"Abbott, Andrew",6.0,4,0,0,...,6.0,0.0,1.0,3.0,0.0,0.333333,1.2,4.628571,0.0,0.257143
4,20230627_CIN@BAL,2023-06-27,CIN,0,starter,"Abbott, Andrew",6.0,2,3,0,...,12.0,3.0,0.666667,15.0,4.5,3.166667,0.833333,9.0,2.25,1.75


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20240401_CIN@PHI,2024-04-01,CIN,0,starter,"Abbott, Andrew",5.333333,3,2,0,...,,,1.359375,10.125,1.35,0.965625,1.359375,10.125,1.35,0.965625
1,20240407_NYM@CIN,2024-04-07,CIN,1,starter,"Abbott, Andrew",5.0,7,2,1,...,4.0,0.0,0.9375,6.75,0.0,-0.375,0.9375,6.75,0.0,-0.375
2,20240412_CIN@CWS,2024-04-12,CIN,0,starter,"Abbott, Andrew",7.0,4,0,0,...,8.0,1.0,2.0,7.2,1.8,2.8,1.451613,6.967742,0.870968,1.16129
3,20240417_CIN@SEA,2024-04-17,CIN,0,starter,"Abbott, Andrew",6.0,4,3,0,...,7.0,1.0,0.571429,3.857143,0.0,-0.857143,1.166667,5.25,0.75,0.666667
4,20240423_PHI@CIN,2024-04-23,CIN,1,starter,"Abbott, Andrew",4.333333,2,4,0,...,9.0,2.0,1.166667,9.0,3.0,3.833333,0.846154,6.230769,1.384615,1.307692


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,pitcher_name,IP,H,BB,HBP,...,roll_7D_starter_K,roll_7D_starter_HR,roll_3D_starter_WHIP,roll_3D_starter_K9,roll_3D_starter_HR9,roll_3D_starter_FIP,roll_7D_starter_WHIP,roll_7D_starter_K9,roll_7D_starter_HR9,roll_7D_starter_FIP
0,20250412_PIT@CIN,2025-04-12,CIN,1,starter,"Abbott, Andrew",5.0,2,2,0,...,,,1.32439,7.507317,1.646341,1.895122,1.32439,7.507317,1.646341,1.895122
1,20250418_CIN@BAL,2025-04-18,CIN,0,starter,"Abbott, Andrew",6.0,2,1,0,...,5.0,1.0,0.8,9.0,1.8,1.8,0.8,9.0,1.8,1.8
2,20250425_CIN@COL,2025-04-25,CIN,0,starter,"Abbott, Andrew",4.0,5,5,0,...,16.0,2.0,0.5,16.5,1.5,-1.0,0.636364,13.090909,1.636364,0.272727
3,20250501_STL@CIN,2025-05-01,CIN,1,starter,"Abbott, Andrew",4.0,3,4,0,...,4.0,1.0,2.5,9.0,2.25,5.0,2.5,9.0,2.25,5.0
4,20250506_CIN@ATL,2025-05-06,CIN,0,starter,"Abbott, Andrew",5.0,4,0,0,...,7.0,1.0,1.75,6.75,0.0,1.5,2.125,7.875,1.125,3.25


 


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,20220408_ATH@PHI,2022-04-08,ATH,0,bullpen,2.666667,5,4,0,2,...,,,,,,,,,,
1,20220409_ATH@PHI,2022-04-09,ATH,0,bullpen,2.333333,0,3,0,3,...,2.0,0.0,3.375,6.75,0.0,3.0,3.375,6.75,0.0,3.0
2,20220410_ATH@PHI,2022-04-10,ATH,0,bullpen,4.0,1,1,0,5,...,5.0,0.0,2.4,9.0,0.0,2.2,2.4,9.0,0.0,2.2
3,20220411_ATH@TB,2022-04-11,ATH,0,bullpen,4.0,5,0,0,6,...,10.0,1.0,1.555556,10.0,1.0,1.888889,1.555556,10.0,1.0,1.888889
4,20220412_ATH@TB,2022-04-12,ATH,0,bullpen,8.0,8,5,0,8,...,16.0,1.0,0.967742,12.193548,0.870968,-0.290323,1.461538,11.076923,0.692308,0.384615


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,20230330_LAA@ATH,2023-03-30,ATH,1,bullpen,4.0,1,3,0,5,...,,,1.439438,8.547104,1.042715,1.049737,1.439438,8.547104,1.042715,1.049737
1,20230401_LAA@ATH,2023-04-01,ATH,1,bullpen,6.666667,6,3,0,3,...,5.0,0.0,1.0,11.25,0.0,-0.25,1.0,11.25,0.0,-0.25
2,20230402_LAA@ATH,2023-04-02,ATH,1,bullpen,3.333333,2,1,0,2,...,8.0,1.0,1.21875,6.75,0.84375,1.40625,1.21875,6.75,0.84375,1.40625
3,20230403_CLE@ATH,2023-04-03,ATH,1,bullpen,5.0,9,2,1,3,...,10.0,1.0,1.2,4.5,0.9,1.5,1.142857,6.428571,0.642857,1.0
4,20230404_CLE@ATH,2023-04-04,ATH,1,bullpen,4.333333,0,2,0,5,...,13.0,1.0,1.6,4.8,0.6,1.2,1.473684,6.157895,0.473684,0.894737


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,20240328_CLE@ATH,2024-03-28,ATH,1,bullpen,5.333333,4,2,2,4,...,,,1.603269,8.424963,1.096582,1.566122,1.603269,8.424963,1.096582,1.566122
1,20240329_CLE@ATH,2024-03-29,ATH,1,bullpen,3.666667,2,4,0,3,...,4.0,0.0,1.5,6.75,0.0,0.75,1.5,6.75,0.0,0.75
2,20240330_CLE@ATH,2024-03-30,ATH,1,bullpen,5.333333,9,3,1,5,...,7.0,0.0,1.555556,7.0,0.0,1.111111,1.555556,7.0,0.0,1.111111
3,20240331_CLE@ATH,2024-03-31,ATH,1,bullpen,1.666667,4,1,1,2,...,12.0,1.0,1.883721,7.534884,0.627907,1.744186,1.883721,7.534884,0.627907,1.744186
4,20240401_BOS@ATH,2024-04-01,ATH,1,bullpen,6.333333,1,3,0,7,...,14.0,1.0,2.34375,8.4375,0.84375,2.15625,2.0625,7.875,0.5625,1.6875


Unnamed: 0,game_id,game_date,pitching_team,is_home_team,pitcher_role,IP,H,BB,HBP,K,...,roll_7D_bullpen_K,roll_7D_bullpen_HR,roll_3D_bullpen_WHIP,roll_3D_bullpen_K9,roll_3D_bullpen_HR9,roll_3D_bullpen_FIP,roll_7D_bullpen_WHIP,roll_7D_bullpen_K9,roll_7D_bullpen_HR9,roll_7D_bullpen_FIP
0,20250327_ATH@SEA,2025-03-27,ATH,0,bullpen,1.666667,2,3,0,1,...,,,1.383833,9.234031,0.793669,0.656303,1.383833,9.234031,0.793669,0.656303
1,20250328_ATH@SEA,2025-03-28,ATH,0,bullpen,3.0,2,2,0,4,...,1.0,2.0,3.0,5.4,10.8,19.8,3.0,5.4,10.8,19.8
2,20250329_ATH@SEA,2025-03-29,ATH,0,bullpen,4.0,2,2,0,8,...,5.0,2.0,1.928571,9.642857,3.857143,6.642857,1.928571,9.642857,3.857143,6.642857
3,20250330_ATH@SEA,2025-03-30,ATH,0,bullpen,1.0,0,0,0,2,...,13.0,2.0,1.5,13.5,2.076923,2.423077,1.5,13.5,2.076923,2.423077
4,20250331_CHC@ATH,2025-03-31,ATH,1,bullpen,5.0,12,6,1,5,...,15.0,2.0,1.0,15.75,0.0,-2.0,1.344828,13.965517,1.862069,1.758621


### Combining Pitching Features at the Game Level

After computing rolling pitching metrics for **starting pitchers** (pitcher-level) and the **bullpen** (team-level), we can finally combine these features into a single **game-level** dataset keyed by `game_id`. This produces one row per game containing the rolling 3-day and 7-day metrics for both the home and away teams.

In addition, we retain the **home and away starting pitcher names** and append `_home` and `_away` suffixes to each feature to clearly indicate which side the metric corresponds to. This consolidated table is then used to compute home–away differences and to merge pitching features with our batting features.


In [50]:
for year in range(2022, 2026):
    globals()[f"game_pitching_rates_{year}"] = combine_game_level_pitching_rolling_rates(
        starter_df=globals()[f"starter_lines_{year}"],
        bullpen_df=globals()[f"bullpen_lines_{year}"],
        windows=("3D", "7D"),
        metrics=("WHIP", "K9", "HR9", "FIP"),
    )

In [51]:
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(game_pitching_rates_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(game_pitching_rates_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(game_pitching_rates_2024.head(5))
display(HTML("<h4>Starting Pitcher Season 2025</h4>")); display(game_pitching_rates_2025.head(5))


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,roll_3D_starter_WHIP_home,roll_3D_starter_K9_home,roll_3D_starter_HR9_home,roll_3D_starter_FIP_home,roll_7D_starter_WHIP_home,roll_7D_starter_K9_home,roll_7D_starter_HR9_home,...,roll_7D_bullpen_HR9_bullpen_home,roll_7D_bullpen_FIP_bullpen_home,roll_3D_bullpen_WHIP_bullpen_away,roll_3D_bullpen_K9_bullpen_away,roll_3D_bullpen_HR9_bullpen_away,roll_3D_bullpen_FIP_bullpen_away,roll_7D_bullpen_WHIP_bullpen_away,roll_7D_bullpen_K9_bullpen_away,roll_7D_bullpen_HR9_bullpen_away,roll_7D_bullpen_FIP_bullpen_away
0,20220407_CIN@ATL,2022-04-07,"Fried, Max",1.14959,8.741803,0.829918,0.141393,1.14959,8.741803,0.829918,...,,,,,,,,,,
1,20220407_CLE@KC,2022-04-07,"Greinke, Zack",1.188235,6.352941,1.588235,1.552941,1.188235,6.352941,1.588235,...,,,,,,,,,,
2,20220407_HOU@LAA,2022-04-07,"Ohtani, Shohei",1.169231,10.8,1.038462,0.346154,1.169231,10.8,1.038462,...,,,,,,,,,,
3,20220407_MIL@CHC,2022-04-07,"Hendricks, Kyle",1.438433,6.598881,1.561567,1.746269,1.438433,6.598881,1.561567,...,,,,,,,,,,
4,20220407_NYM@WSH,2022-04-07,"Corbin, Patrick",1.502947,7.585462,1.962672,2.263261,1.502947,7.585462,1.962672,...,,,,,,,,,,


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,roll_3D_starter_WHIP_home,roll_3D_starter_K9_home,roll_3D_starter_HR9_home,roll_3D_starter_FIP_home,roll_7D_starter_WHIP_home,roll_7D_starter_K9_home,roll_7D_starter_HR9_home,...,roll_7D_bullpen_HR9_bullpen_home,roll_7D_bullpen_FIP_bullpen_home,roll_3D_bullpen_WHIP_bullpen_away,roll_3D_bullpen_K9_bullpen_away,roll_3D_bullpen_HR9_bullpen_away,roll_3D_bullpen_FIP_bullpen_away,roll_7D_bullpen_WHIP_bullpen_away,roll_7D_bullpen_K9_bullpen_away,roll_7D_bullpen_HR9_bullpen_away,roll_7D_bullpen_FIP_bullpen_away
0,20230330_ATL@WSH,2023-03-30,"Corbin, Patrick",1.773333,7.68,1.62,1.753333,1.773333,7.68,1.62,...,1.17019,1.141649,1.195312,10.222356,0.859976,0.165865,1.195312,10.222356,0.859976,0.165865
1,20230330_AZ@LAD,2023-03-30,"Urías, Julio",0.996161,8.602687,1.191939,0.604607,0.996161,8.602687,1.191939,...,0.823834,0.131261,1.446237,7.806452,1.145161,1.258065,1.446237,7.806452,1.145161,1.258065
2,20230330_BAL@BOS,2023-03-30,"Kluber, Corey",1.292784,7.738144,1.113402,0.463918,1.292784,7.738144,1.113402,...,1.067136,1.039524,1.321814,8.601512,0.87473,0.532937,1.321814,8.601512,0.87473,0.532937
3,20230330_CLE@SEA,2023-03-30,"Castillo, Luis",1.147651,10.087248,0.785235,-0.040268,1.147651,10.087248,0.785235,...,1.10136,0.552534,1.160025,9.784558,0.874222,0.254047,1.160025,9.784558,0.874222,0.254047
4,20230330_COL@SD,2023-03-30,"Snell, Blake",1.231579,12.15,0.781579,-0.315789,1.231579,12.15,0.781579,...,0.886068,0.5387,1.43617,8.585106,0.989362,0.946809,1.43617,8.585106,0.989362,0.946809


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,roll_3D_starter_WHIP_home,roll_3D_starter_K9_home,roll_3D_starter_HR9_home,roll_3D_starter_FIP_home,roll_7D_starter_WHIP_home,roll_7D_starter_K9_home,roll_7D_starter_HR9_home,...,roll_7D_bullpen_HR9_bullpen_home,roll_7D_bullpen_FIP_bullpen_home,roll_3D_bullpen_WHIP_bullpen_away,roll_3D_bullpen_K9_bullpen_away,roll_3D_bullpen_HR9_bullpen_away,roll_3D_bullpen_FIP_bullpen_away,roll_7D_bullpen_WHIP_bullpen_away,roll_7D_bullpen_K9_bullpen_away,roll_7D_bullpen_HR9_bullpen_away,roll_7D_bullpen_FIP_bullpen_away
0,20240320_LAD@SD,2024-03-20,"Darvish, Yu",1.369727,9.44665,1.205955,0.759305,1.369727,9.44665,1.205955,...,1.025117,0.977804,1.235662,9.347237,0.957247,0.478624,1.235662,9.347237,0.957247,0.478624
1,20240321_SD@LAD,2024-03-21,"Yamamoto, Yoshinobu",1.368354,8.575673,1.341475,1.18303,1.368354,8.575673,1.341475,...,0.0,-1.5,2.25,6.75,0.0,2.4375,2.25,6.75,0.0,2.4375
2,20240328_BOS@SEA,2024-03-28,"Castillo, Luis",1.133333,10.107692,1.292308,0.558974,1.133333,10.107692,1.292308,...,0.993865,0.546626,1.479284,8.866496,1.077238,0.957545,1.479284,8.866496,1.077238,0.957545
3,20240328_CHC@TEX,2024-03-28,"Eovaldi, Nathan",1.197183,8.366197,0.950704,0.633803,1.197183,8.366197,0.950704,...,1.365259,1.199286,1.364747,10.005236,1.005236,0.820244,1.364747,10.005236,1.005236,0.820244
4,20240328_CLE@ATH,2024-03-28,"Wood, Alex",1.757143,7.521429,1.35,2.271429,1.757143,7.521429,1.35,...,1.096582,1.566122,1.354951,9.145918,1.031847,0.766068,1.354951,9.145918,1.031847,0.766068


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,roll_3D_starter_WHIP_home,roll_3D_starter_K9_home,roll_3D_starter_HR9_home,roll_3D_starter_FIP_home,roll_7D_starter_WHIP_home,roll_7D_starter_K9_home,roll_7D_starter_HR9_home,...,roll_7D_bullpen_HR9_bullpen_home,roll_7D_bullpen_FIP_bullpen_home,roll_3D_bullpen_WHIP_bullpen_away,roll_3D_bullpen_K9_bullpen_away,roll_3D_bullpen_HR9_bullpen_away,roll_3D_bullpen_FIP_bullpen_away,roll_7D_bullpen_WHIP_bullpen_away,roll_7D_bullpen_K9_bullpen_away,roll_7D_bullpen_HR9_bullpen_away,roll_7D_bullpen_FIP_bullpen_away
0,20250318_LAD@CHC,2025-03-18,"Imanaga, Shota",1.056751,9.193738,1.426614,0.563601,1.056751,9.193738,1.426614,...,1.009496,0.802967,1.259937,8.839958,1.101464,0.902197,1.259937,8.839958,1.101464,0.902197
1,20250319_LAD@CHC,2025-03-19,"Steele, Justin",1.134328,9.067164,0.80597,0.067164,1.134328,9.067164,0.80597,...,0.0,-0.4,0.25,11.25,0.0,-1.75,0.25,11.25,0.0,-1.75
2,20250327_ATH@SEA,2025-03-27,"Gilbert, Logan",0.910112,9.53451,1.126806,0.101124,0.910112,9.53451,1.126806,...,1.133517,0.749484,1.383833,9.234031,0.793669,0.656303,1.383833,9.234031,0.793669,0.656303
3,20250327_ATL@SD,2025-03-27,"King, Michael",1.232143,10.5,0.857143,0.10119,1.232143,10.5,0.857143,...,0.983549,0.430082,1.23221,9.994382,0.960674,0.2397,1.23221,9.994382,0.960674,0.2397
4,20250327_BAL@TOR,2025-03-27,"Berríos, José",1.241197,7.272887,1.473592,1.573944,1.241197,7.272887,1.473592,...,1.4821,1.696897,1.290865,9.265024,1.038462,0.722957,1.290865,9.265024,1.038462,0.722957


### Pitching Deltas (Home − Away)

After constructing game-level pitching features for both teams, we compute **home–away differences** for each rolling metric. These deltas summarize the relative pitching advantage in a single value per game (e.g., higher starter K/9 for the home team versus the away team).

The resulting dataset retains only the game identifiers and starting pitcher names, along with Δ features for rolling **starter** metrics (FIP, WHIP, K/9, HR/9) and rolling **bullpen FIP** for each window (3-day and 7-day). This format is convenient for downstream modeling and for merging with game-level batting features.


In [52]:
for year in range(2022, 2026):
    globals()[f"game_pitching_deltas_{year}"] = make_pitching_delta_df(
        globals()[f"game_pitching_rates_{year}"]
    )


In [53]:
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(game_pitching_deltas_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(game_pitching_deltas_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(game_pitching_deltas_2024.head(5))
display(HTML("<h4>Starting Pitcher Season 2025</h4>")); display(game_pitching_deltas_2025.head(5))


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D
0,20220407_CIN@ATL,2022-04-07,"Fried, Max","Mahle, Tyler",-0.500258,-0.156226,-1.896095,-0.385842,,-0.500258,-0.156226,-1.896095,-0.385842,
1,20220407_CLE@KC,2022-04-07,"Greinke, Zack","Bieber, Shane",1.69827,-0.06782,-6.16609,0.560554,,1.69827,-0.06782,-6.16609,0.560554,
2,20220407_HOU@LAA,2022-04-07,"Ohtani, Shohei","Valdez, Framber",-0.501346,-0.173269,2.3625,0.228462,,-0.501346,-0.173269,2.3625,0.228462,
3,20220407_MIL@CHC,2022-04-07,"Hendricks, Kyle","Burnes, Corbin",3.291359,0.458473,-6.062442,1.18281,,3.291359,0.458473,-6.062442,1.18281,
4,20220407_NYM@WSH,2022-04-07,"Corbin, Patrick","Megill, Tylor",0.740873,0.193245,-2.388419,0.048493,,0.740873,0.193245,-2.388419,0.048493,


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D
0,20230330_ATL@WSH,2023-03-30,"Corbin, Patrick","Fried, Max",2.167879,0.726061,-0.665455,1.030909,0.975784,2.167879,0.726061,-0.665455,1.030909,0.975784
1,20230330_AZ@LAD,2023-03-30,"Urías, Julio","Gallen, Zac",0.670662,0.005336,-0.909239,0.448819,-1.126804,0.670662,0.005336,-0.909239,0.448819,-1.126804
2,20230330_BAL@BOS,2023-03-30,"Kluber, Corey","Gibson, Kyle",-0.710781,-0.110831,-0.069085,-0.187803,0.506586,-0.710781,-0.110831,-0.069085,-0.187803,0.506586
3,20230330_CLE@SEA,2023-03-30,"Castillo, Luis","Bieber, Shane",0.202156,0.087045,1.087248,-0.032947,0.298487,0.202156,0.087045,1.087248,-0.032947,0.298487
4,20230330_COL@SD,2023-03-30,"Snell, Blake","Márquez, Germán",-1.935456,-0.171018,4.636085,-0.721204,-0.408109,-1.935456,-0.171018,4.636085,-0.721204,-0.408109


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D
0,20240320_LAD@SD,2024-03-20,"Darvish, Yu","Glasnow, Tyler",1.100972,0.27806,-2.70335,0.230955,0.49918,1.100972,0.27806,-2.70335,0.230955,0.49918
1,20240321_SD@LAD,2024-03-21,"Yamamoto, Yoshinobu","Musgrove, Joe",0.913134,0.12268,-0.48661,0.407219,-3.9375,0.913134,0.12268,-0.48661,0.407219,-3.9375
2,20240328_BOS@SEA,2024-03-28,"Castillo, Luis","Bello, Brayan",-0.743401,-0.266235,2.410068,-0.10726,-0.410919,-0.743401,-0.266235,2.410068,-0.10726,-0.410919
3,20240328_CHC@TEX,2024-03-28,"Eovaldi, Nathan","Steele, Justin",0.871255,-0.019033,-0.807548,0.220974,0.379042,0.871255,-0.019033,-0.807548,0.220974,0.379042
4,20240328_CLE@ATH,2024-03-28,"Wood, Alex","Bieber, Shane",1.642781,0.475976,-0.141701,0.347347,0.800054,1.642781,0.475976,-0.141701,0.347347,0.800054


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D
0,20250318_LAD@CHC,2025-03-18,"Imanaga, Shota","Yamamoto, Yoshinobu",1.123302,-0.073846,-1.38462,0.721391,-0.099229,1.123302,-0.073846,-1.38462,0.721391,-0.099229
1,20250319_LAD@CHC,2025-03-19,"Steele, Justin","Sasaki, Roki",-0.934686,-0.190904,0.61794,-0.423393,1.35,-0.934686,-0.190904,0.61794,-0.423393,1.35
2,20250327_ATH@SEA,2025-03-27,"Gilbert, Logan","Severino, Luis",-0.960329,-0.41391,1.439538,-0.029619,0.093181,-0.960329,-0.41391,1.439538,-0.029619,0.093181
3,20250327_ATL@SD,2025-03-27,"King, Michael","Sale, Chris",1.198333,0.157857,-1.071429,0.394286,0.190382,1.198333,0.157857,-1.071429,0.394286,0.190382
4,20250327_BAL@TOR,2025-03-27,"Berríos, José","Eflin, Zach",0.97151,0.066755,-0.065855,0.268723,0.973941,0.97151,0.066755,-0.065855,0.268723,0.973941


## Batter Metrics

Now, we can begin creating the features needed for modeling. We start by constructing pitching features based on the **starting pitcher**, including:


Now, we can calculate batter metrics. These are the team's

- On Base Percentage (OBP)

- Isolated Power (ISO)


$\text{OBP} = \frac{\text{H} + \text{BB} + \text{HBP}}
{\text{AB} + \text{BB} + \text{HBP} + \text{SF}}$

$\text{ISO} = \frac{2\text{B} + (2*3\text{B}) + (3*\text{HR})}{\text{AB}}
$


### Batting Indicators

Like pitchers, we create batting indicators that serve as building blocks for our batting feature calculations.


In [53]:
for y in range(2022, 2026):
    src_name = f"pa_{y}"
    dst_name = f"pa_batter_{y}"

    df = globals().get(src_name)
    if df is None:
        print(f"{src_name}: (not found)")
        continue

    globals()[dst_name] = add_batting_indicators(df.copy())
    print(f"{dst_name}: indicators added (from {src_name})")


pa_batter_2022: indicators added (from pa_2022)
pa_batter_2023: indicators added (from pa_2023)
pa_batter_2024: indicators added (from pa_2024)
pa_batter_2025: indicators added (from pa_2025)


In [54]:
display(HTML("<h4>Starting Pitcher Season 2022</h4>")); display(pa_batter_2022.head(5))
display(HTML("<h4>Starting Pitcher Season 2023</h4>")); display(pa_batter_2023.head(5))
display(HTML("<h4>Starting Pitcher Season 2024</h4>")); display(pa_batter_2024.head(5))
display(HTML("<h4>Starting Pitcher Season 2025</h4>")); display(pa_batter_2025.head(5))


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_starter,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
2,2022-04-07,ATL,CIN,1,Top,3,0,0,0,strikeout,...,1,0,0,0,0,0,0,0,0,0
7,2022-04-07,ATL,CIN,1,Top,5,1,0,0,strikeout,...,1,0,0,0,0,0,0,0,0,0
10,2022-04-07,ATL,CIN,1,Top,3,2,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0
15,2022-04-07,ATL,CIN,2,Top,5,0,0,0,strikeout,...,1,0,0,0,0,0,0,0,0,0
21,2022-04-07,ATL,CIN,2,Top,2,1,0,0,hit_by_pitch,...,1,0,1,0,0,0,0,0,0,0


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_starter,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
1,2023-03-30,WSH,ATL,1,Bot,1,0,0,0,single,...,1,0,0,0,0,0,1,0,0,0
3,2023-03-30,WSH,ATL,1,Bot,3,0,0,0,grounded_into_double_play,...,1,0,0,0,0,0,0,0,0,0
7,2023-03-30,WSH,ATL,1,Bot,4,2,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0
9,2023-03-30,WSH,ATL,2,Bot,1,0,0,3,double,...,1,0,0,0,0,0,0,1,0,0
13,2023-03-30,WSH,ATL,2,Bot,3,0,0,3,sac_fly,...,1,0,0,1,0,0,0,0,0,0


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_starter,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
5,2024-03-20,SD,LAD,1,Bot,6,0,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0
6,2024-03-20,SD,LAD,1,Bot,1,1,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0
12,2024-03-20,SD,LAD,1,Bot,6,2,0,0,strikeout,...,1,0,0,0,0,0,0,0,0,0
13,2024-03-20,SD,LAD,2,Bot,1,0,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0
18,2024-03-20,SD,LAD,2,Bot,5,1,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_starter,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr
2,2025-03-18,CHC,LAD,1,Top,3,0,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0
5,2025-03-18,CHC,LAD,1,Top,3,1,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0
8,2025-03-18,CHC,LAD,1,Top,3,2,0,0,strikeout,...,1,0,0,0,0,0,0,0,0,0
21,2025-03-18,CHC,LAD,2,Top,5,0,0,0,field_out,...,1,0,0,0,0,0,0,0,0,0
27,2025-03-18,CHC,LAD,2,Top,7,0,0,0,walk,...,1,1,0,0,0,0,0,0,0,0


### Split into away-batting and home-batting DataFrames

For convenience, we split each `pa_batter_yyyy` DataFrame into separate away- and home-batting DataFrames.


In [55]:
for y in range(2022, 2026):
    pa = globals().get(f"pa_batter_{y}")
    if pa is None:
        print(f"pa_batter_{y}: (not found)")
        continue

    away_df, home_df = split_batting_home_away(pa)
    globals()[f"away_batting_{y}"] = away_df
    globals()[f"home_batting_{y}"] = home_df

    print(f"away_batting_{y}={len(away_df):,}, home_batting_{y}={len(home_df):,}")



away_batting_2022=92,961, home_batting_2022=89,186
away_batting_2023=94,097, home_batting_2023=90,066
away_batting_2024=93,361, home_batting_2024=89,155
away_batting_2025=93,467, home_batting_2025=89,482


In [56]:
display(HTML("<h4>Home Season 2022</h4>")); display(home_batting_2022.head(5))
display(HTML("<h4>Home Season 2023</h4>")); display(home_batting_2023.head(5))
display(HTML("<h4>Home Season 2024</h4>")); display(home_batting_2024.head(5))
display(HTML("<h4>Home Season 2025</h4>")); display(home_batting_2025.head(5))
print(" ")
display(HTML("<h4>Away Season 2022</h4>")); display(away_batting_2022.head(5))
display(HTML("<h4>Away Season 2023</h4>")); display(away_batting_2023.head(5))
display(HTML("<h4>Away Season 2024</h4>")); display(away_batting_2024.head(5))
display(HTML("<h4>Away Season 2025</h4>")); display(away_batting_2025.head(5))

Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr,batting_team
144,2022-04-07,ATL,CIN,1,Bot,3,0,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,ATL
147,2022-04-07,ATL,CIN,1,Bot,3,1,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,ATL
150,2022-04-07,ATL,CIN,1,Bot,2,2,0,0,field_out,...,0,0,0,0,0,0,0,0,0,ATL
152,2022-04-07,ATL,CIN,1,Bot,3,2,0,0,single,...,0,0,0,0,0,1,0,0,0,ATL
156,2022-04-07,ATL,CIN,2,Bot,4,0,0,1,field_out,...,0,0,0,0,0,0,0,0,0,ATL


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr,batting_team
1,2023-03-30,WSH,ATL,1,Bot,1,0,0,0,single,...,0,0,0,0,0,1,0,0,0,WSH
3,2023-03-30,WSH,ATL,1,Bot,3,0,0,0,grounded_into_double_play,...,0,0,0,0,0,0,0,0,0,WSH
7,2023-03-30,WSH,ATL,1,Bot,4,2,0,0,field_out,...,0,0,0,0,0,0,0,0,0,WSH
9,2023-03-30,WSH,ATL,2,Bot,1,0,0,3,double,...,0,0,0,0,0,0,1,0,0,WSH
13,2023-03-30,WSH,ATL,2,Bot,3,0,0,3,sac_fly,...,0,0,1,0,0,0,0,0,0,WSH


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr,batting_team
5,2024-03-20,SD,LAD,1,Bot,6,0,0,0,field_out,...,0,0,0,0,0,0,0,0,0,SD
6,2024-03-20,SD,LAD,1,Bot,1,1,0,0,field_out,...,0,0,0,0,0,0,0,0,0,SD
12,2024-03-20,SD,LAD,1,Bot,6,2,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,SD
13,2024-03-20,SD,LAD,2,Bot,1,0,0,0,field_out,...,0,0,0,0,0,0,0,0,0,SD
18,2024-03-20,SD,LAD,2,Bot,5,1,0,0,field_out,...,0,0,0,0,0,0,0,0,0,SD


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr,batting_team
189,2025-03-18,CHC,LAD,1,Bot,3,0,0,0,field_out,...,0,0,0,0,0,0,0,0,0,CHC
193,2025-03-18,CHC,LAD,1,Bot,6,0,0,0,walk,...,1,0,0,0,0,0,0,0,0,CHC
196,2025-03-18,CHC,LAD,1,Bot,3,1,0,0,field_out,...,0,0,0,0,0,0,0,0,0,CHC
200,2025-03-18,CHC,LAD,1,Bot,4,2,0,0,field_out,...,0,0,0,0,0,0,0,0,0,CHC
206,2025-03-18,CHC,LAD,2,Bot,6,0,0,0,field_out,...,0,0,0,0,0,0,0,0,0,CHC


 


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr,batting_team
2,2022-04-07,ATL,CIN,1,Top,3,0,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,CIN
7,2022-04-07,ATL,CIN,1,Top,5,1,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,CIN
10,2022-04-07,ATL,CIN,1,Top,3,2,0,0,field_out,...,0,0,0,0,0,0,0,0,0,CIN
15,2022-04-07,ATL,CIN,2,Top,5,0,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,CIN
21,2022-04-07,ATL,CIN,2,Top,2,1,0,0,hit_by_pitch,...,0,1,0,0,0,0,0,0,0,CIN


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr,batting_team
151,2023-03-30,WSH,ATL,1,Top,4,0,0,0,single,...,0,0,0,0,0,1,0,0,0,ATL
155,2023-03-30,WSH,ATL,1,Top,8,0,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,ATL
162,2023-03-30,WSH,ATL,1,Top,4,1,0,0,field_out,...,0,0,0,0,0,0,0,0,0,ATL
164,2023-03-30,WSH,ATL,1,Top,5,1,0,0,walk,...,1,0,0,0,0,0,0,0,0,ATL
168,2023-03-30,WSH,ATL,1,Top,4,2,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,ATL


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr,batting_team
133,2024-03-20,SD,LAD,1,Top,4,0,0,0,force_out,...,0,0,0,0,0,0,0,0,0,LAD
135,2024-03-20,SD,LAD,1,Top,5,0,0,0,walk,...,1,0,0,0,0,0,0,0,0,LAD
139,2024-03-20,SD,LAD,1,Top,4,1,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,LAD
141,2024-03-20,SD,LAD,1,Top,2,2,0,0,field_out,...,0,0,0,0,0,0,0,0,0,LAD
153,2024-03-20,SD,LAD,2,Top,6,0,0,0,single,...,0,0,0,0,0,1,0,0,0,LAD


Unnamed: 0,game_date,home_team,away_team,inning,inning_topbot,pitch_number,outs_when_up,home_score,away_score,events,...,is_bb,is_hbp,is_sf,is_sh,is_ci,is_1b,is_2b,is_3b,is_hr,batting_team
2,2025-03-18,CHC,LAD,1,Top,3,0,0,0,field_out,...,0,0,0,0,0,0,0,0,0,LAD
5,2025-03-18,CHC,LAD,1,Top,3,1,0,0,field_out,...,0,0,0,0,0,0,0,0,0,LAD
8,2025-03-18,CHC,LAD,1,Top,3,2,0,0,strikeout,...,0,0,0,0,0,0,0,0,0,LAD
21,2025-03-18,CHC,LAD,2,Top,5,0,0,0,field_out,...,0,0,0,0,0,0,0,0,0,LAD
27,2025-03-18,CHC,LAD,2,Top,7,0,0,0,walk,...,1,0,0,0,0,0,0,0,0,LAD


### Aggregate plate appearances to team-game batting totals

This function takes a plate-appearance–level batting DataFrame and aggregates it to one row per `(game_id, game_date, batting_team)`. It first derives game-level counting stats—hits (`H`), total bases (`TB`), and at-bats (`AB`)—from the PA indicators. In particular, `AB` excludes non–at-bat events (walks, hit-by-pitch, sacrifice flies, sacrifice bunts, catcher’s interference). It then groups by game and team to produce team-game totals such as `PA`, `AB`, `H`, `TB`, `BB`, `HBP`, `SF`, `SH`, `CI`, `HR`, and hit-type counts (`_1B`, `_2B`, `_3B`). These totals are intended as building blocks for downstream rate stats like OBP and ISO.


In [57]:
for y in range(2022, 2026):
    away_name = f"away_batting_{y}"
    home_name = f"home_batting_{y}"

    away_df = globals().get(away_name)
    home_df = globals().get(home_name)

    if away_df is None or home_df is None:
        print(f"{y}: missing {away_name} or {home_name}")
        continue

    globals()[away_name] = aggregate_team_game_batting(away_df)
    globals()[home_name] = aggregate_team_game_batting(home_df)

    print(f"{y}: updated {away_name} ({len(globals()[away_name]):,} rows), "
          f"{home_name} ({len(globals()[home_name]):,} rows)")


2022: updated away_batting_2022 (2,450 rows), home_batting_2022 (2,454 rows)
2023: updated away_batting_2023 (2,440 rows), home_batting_2023 (2,443 rows)
2024: updated away_batting_2024 (2,440 rows), home_batting_2024 (2,439 rows)
2025: updated away_batting_2025 (2,437 rows), home_batting_2025 (2,436 rows)


### Add time-based rolling batting sums by team

This function adds **rolling sums** of team batting totals over pre-defined time windows (default: **3 days** and **7 days**). For each team, the rolling totals are computed using **only prior games** (by shifting one game), which prevents information from the current game from entering its own features. The resulting columns are named like `roll_3D_H`, `roll_7D_AB`, and `roll_3D_TB`, and the output is returned sorted by team and game date.



In [58]:
for y in range(2022, 2026):
    away_name = f"away_batting_{y}"
    home_name = f"home_batting_{y}"

    globals()[away_name] = add_time_rolling_batting_sums(
        globals()[away_name],
        windows=("3D", "7D"),
        sum_cols=["AB", "H", "BB", "HBP", "SF", "HR", "_2B", "_3B"],
        min_periods=1,
    )

    globals()[home_name] = add_time_rolling_batting_sums(
        globals()[home_name],
        windows=("3D", "7D"),
        sum_cols=["AB", "H", "BB", "HBP", "SF", "HR", "_2B", "_3B"],
        min_periods=1,
    )


  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)
  out = df.groupby(team_col, group_keys=False, sort=False).apply(_apply)


In [59]:
display(HTML("<h4>Home Season 2022</h4>")); display(home_batting_2022.head(5))
display(HTML("<h4>Home Season 2023</h4>")); display(home_batting_2023.head(5))
display(HTML("<h4>Home Season 2024</h4>")); display(home_batting_2024.head(5))
display(HTML("<h4>Home Season 2025</h4>")); display(home_batting_2025.head(5))
print(" ")
display(HTML("<h4>Away Season 2022</h4>")); display(away_batting_2022.head(5))
display(HTML("<h4>Away Season 2023</h4>")); display(away_batting_2023.head(5))
display(HTML("<h4>Away Season 2024</h4>")); display(away_batting_2024.head(5))
display(HTML("<h4>Away Season 2025</h4>")); display(away_batting_2025.head(5))

Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,20220418_BAL@ATH,2022-04-18,ATH,34,31,5,6,3,0,0,...,,,,,,,,,,
1,20220419_BAL@ATH,2022-04-19,ATH,35,30,7,8,5,0,0,...,1.0,0.0,31.0,5.0,3.0,0.0,0.0,0.0,1.0,0.0
2,20220420_BAL@ATH,2022-04-20,ATH,35,33,7,8,1,1,0,...,2.0,0.0,61.0,12.0,8.0,0.0,0.0,0.0,2.0,0.0
3,20220421_BAL@ATH,2022-04-21,ATH,35,35,11,21,0,0,0,...,3.0,0.0,94.0,19.0,9.0,1.0,0.0,0.0,3.0,0.0
4,20220422_TEX@ATH,2022-04-22,ATH,32,29,2,4,3,0,0,...,6.0,0.0,129.0,30.0,9.0,1.0,0.0,2.0,7.0,0.0


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,20230330_LAA@ATH,2023-03-30,ATH,33,29,6,8,4,0,0,...,,,,,,,,,,
1,20230401_LAA@ATH,2023-04-01,ATH,34,30,5,8,3,0,0,...,2.0,0.0,29.0,6.0,4.0,0.0,0.0,0.0,2.0,0.0
2,20230402_LAA@ATH,2023-04-02,ATH,35,31,5,6,3,1,0,...,2.0,0.0,59.0,11.0,7.0,0.0,0.0,1.0,2.0,0.0
3,20230403_CLE@ATH,2023-04-03,ATH,46,43,14,26,3,0,0,...,3.0,0.0,90.0,16.0,10.0,1.0,0.0,1.0,3.0,0.0
4,20230404_CLE@ATH,2023-04-04,ATH,37,32,6,9,4,0,1,...,4.0,0.0,133.0,30.0,13.0,1.0,0.0,4.0,6.0,0.0


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,20240328_CLE@ATH,2024-03-28,ATH,31,30,4,5,1,0,0,...,,,,,,,,,,
1,20240329_CLE@ATH,2024-03-29,ATH,36,32,8,16,2,0,2,...,1.0,0.0,30.0,4.0,1.0,0.0,0.0,0.0,1.0,0.0
2,20240330_CLE@ATH,2024-03-30,ATH,40,32,6,7,6,2,0,...,3.0,0.0,62.0,12.0,3.0,0.0,2.0,2.0,3.0,0.0
3,20240331_CLE@ATH,2024-03-31,ATH,36,30,9,14,4,1,0,...,4.0,0.0,94.0,18.0,9.0,2.0,2.0,2.0,4.0,0.0
4,20240401_BOS@ATH,2024-04-01,ATH,31,31,4,5,0,0,0,...,4.0,2.0,124.0,27.0,13.0,3.0,2.0,2.0,5.0,2.0


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,20250331_CHC@ATH,2025-03-31,ATH,38,36,10,15,2,0,0,...,,,,,,,,,,
1,20250401_CHC@ATH,2025-04-01,ATH,35,33,8,14,2,0,0,...,2.0,0.0,36.0,10.0,2.0,0.0,0.0,1.0,2.0,0.0
2,20250402_CHC@ATH,2025-04-02,ATH,36,31,5,9,5,0,0,...,2.0,0.0,69.0,18.0,4.0,0.0,0.0,3.0,2.0,0.0
3,20250407_SD@ATH,2025-04-07,ATH,38,36,10,17,2,0,0,...,1.0,0.0,100.0,23.0,9.0,0.0,0.0,4.0,3.0,0.0
4,20250408_SD@ATH,2025-04-08,ATH,39,36,13,20,3,0,0,...,2.0,0.0,100.0,23.0,9.0,0.0,0.0,5.0,2.0,0.0


 


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,20220408_ATH@PHI,2022-04-08,ATH,35,34,6,13,0,1,0,...,,,,,,,,,,
1,20220409_ATH@PHI,2022-04-09,ATH,31,30,5,6,1,0,0,...,1.0,0.0,34.0,6.0,0.0,1.0,0.0,2.0,1.0,0.0
2,20220410_ATH@PHI,2022-04-10,ATH,39,35,9,14,3,1,0,...,2.0,0.0,64.0,11.0,1.0,1.0,0.0,2.0,2.0,0.0
3,20220411_ATH@TB,2022-04-11,ATH,45,41,13,29,3,1,0,...,4.0,0.0,99.0,20.0,4.0,2.0,0.0,3.0,4.0,0.0
4,20220412_ATH@TB,2022-04-12,ATH,45,35,8,12,9,1,0,...,5.0,1.0,140.0,33.0,7.0,3.0,0.0,7.0,6.0,1.0


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,20230407_ATH@TB,2023-04-07,ATH,38,36,12,20,1,1,0,...,,,,,,,,,,
1,20230408_ATH@TB,2023-04-08,ATH,32,28,3,3,3,0,0,...,2.0,0.0,36.0,12.0,1.0,1.0,0.0,2.0,2.0,0.0
2,20230409_ATH@TB,2023-04-09,ATH,29,28,1,2,1,0,0,...,2.0,0.0,64.0,15.0,4.0,1.0,0.0,2.0,2.0,0.0
3,20230410_ATH@BAL,2023-04-10,ATH,35,32,8,12,3,0,0,...,3.0,0.0,92.0,16.0,5.0,1.0,0.0,2.0,3.0,0.0
4,20230411_ATH@BAL,2023-04-11,ATH,43,39,12,16,4,0,0,...,3.0,1.0,124.0,24.0,8.0,1.0,0.0,2.0,5.0,1.0


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,20240405_ATH@DET,2024-04-05,ATH,37,32,5,13,5,0,0,...,,,,,,,,,,
1,20240406_ATH@DET,2024-04-06,ATH,37,31,5,8,6,0,0,...,2.0,0.0,32.0,5.0,5.0,0.0,0.0,2.0,2.0,0.0
2,20240407_ATH@DET,2024-04-07,ATH,42,39,14,20,2,1,0,...,2.0,0.0,63.0,10.0,11.0,0.0,0.0,3.0,2.0,0.0
3,20240409_ATH@TEX,2024-04-09,ATH,35,32,5,14,3,0,0,...,1.0,1.0,102.0,24.0,13.0,1.0,0.0,4.0,3.0,1.0
4,20240410_ATH@TEX,2024-04-10,ATH,38,37,9,13,1,0,0,...,1.0,1.0,134.0,29.0,16.0,1.0,0.0,7.0,3.0,1.0


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_3D__2B,roll_3D__3B,roll_7D_AB,roll_7D_H,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B
0,20250327_ATH@SEA,2025-03-27,ATH,30,29,3,9,1,0,0,...,,,,,,,,,,
1,20250328_ATH@SEA,2025-03-28,ATH,44,38,12,23,5,1,0,...,0.0,0.0,29.0,3.0,1.0,0.0,0.0,2.0,0.0,0.0
2,20250329_ATH@SEA,2025-03-29,ATH,38,36,9,12,2,0,0,...,2.0,0.0,67.0,15.0,6.0,1.0,0.0,5.0,2.0,0.0
3,20250330_ATH@SEA,2025-03-30,ATH,34,30,4,8,3,0,0,...,2.0,0.0,103.0,24.0,8.0,1.0,0.0,6.0,2.0,0.0
4,20250404_ATH@COL,2025-04-04,ATH,45,42,11,16,3,0,0,...,1.0,0.0,104.0,25.0,10.0,1.0,0.0,5.0,3.0,0.0


### Add rolling OBP and ISO from rolling batting totals

This step computes **rolling OBP** and **rolling ISO** directly from the rolling sum columns that were created earlier (e.g., rolling `AB`, `H`, `BB`, `HBP`, `SF`, `HR`, `2B`, `3B`). For each window (default: `3D`, `7D`), it derives:

- `roll_{w}_OBP` using the rolling totals in the OBP formula (with a denominator check to avoid divide-by-zero)
- `roll_{w}_ISO` using rolling extra-base production divided by rolling at-bats (also guarded against divide-by-zero)


In [60]:
years = [2022, 2023, 2024, 2025]
away = {y: globals()[f"away_batting_{y}"] for y in years}
home = {y: globals()[f"home_batting_{y}"] for y in years}

away = add_rolling_obp_iso_batch(away)
home = add_rolling_obp_iso_batch(home)

# (optional) put back into your original variable names
for y in years:
    globals()[f"away_batting_{y}"] = away[y]
    globals()[f"home_batting_{y}"] = home[y]

In [61]:
display(HTML("<h4>Home Season 2022</h4>")); display(home_batting_2022.head(5))
display(HTML("<h4>Home Season 2023</h4>")); display(home_batting_2023.head(5))
display(HTML("<h4>Home Season 2024</h4>")); display(home_batting_2024.head(5))
display(HTML("<h4>Home Season 2025</h4>")); display(home_batting_2025.head(5))
print(" ")
display(HTML("<h4>Away Season 2022</h4>")); display(away_batting_2022.head(5))
display(HTML("<h4>Away Season 2023</h4>")); display(away_batting_2023.head(5))
display(HTML("<h4>Away Season 2024</h4>")); display(away_batting_2024.head(5))
display(HTML("<h4>Away Season 2025</h4>")); display(away_batting_2025.head(5))

Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,20220418_BAL@ATH,2022-04-18,ATH,34,31,5,6,3,0,0,...,,,,,,,,,,
1,20220419_BAL@ATH,2022-04-19,ATH,35,30,7,8,5,0,0,...,3.0,0.0,0.0,0.0,1.0,0.0,0.235294,0.032258,0.235294,0.032258
2,20220420_BAL@ATH,2022-04-20,ATH,35,33,7,8,1,1,0,...,8.0,0.0,0.0,0.0,2.0,0.0,0.289855,0.032787,0.289855,0.032787
3,20220421_BAL@ATH,2022-04-21,ATH,35,35,11,21,0,0,0,...,9.0,1.0,0.0,0.0,3.0,0.0,0.278846,0.031915,0.278846,0.031915
4,20220422_TEX@ATH,2022-04-22,ATH,32,29,2,4,3,0,0,...,9.0,1.0,0.0,2.0,7.0,0.0,0.304762,0.122449,0.28777,0.100775


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,20230330_LAA@ATH,2023-03-30,ATH,33,29,6,8,4,0,0,...,,,,,,,,,,
1,20230401_LAA@ATH,2023-04-01,ATH,34,30,5,8,3,0,0,...,4.0,0.0,0.0,0.0,2.0,0.0,0.30303,0.068966,0.30303,0.068966
2,20230402_LAA@ATH,2023-04-02,ATH,35,31,5,6,3,1,0,...,7.0,0.0,0.0,1.0,2.0,0.0,0.272727,0.084746,0.272727,0.084746
3,20230403_CLE@ATH,2023-04-03,ATH,46,43,14,26,3,0,0,...,10.0,1.0,0.0,1.0,3.0,0.0,0.267327,0.066667,0.267327,0.066667
4,20230404_CLE@ATH,2023-04-04,ATH,37,32,6,9,4,0,1,...,13.0,1.0,0.0,4.0,6.0,0.0,0.298246,0.153846,0.29932,0.135338


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,20240328_CLE@ATH,2024-03-28,ATH,31,30,4,5,1,0,0,...,,,,,,,,,,
1,20240329_CLE@ATH,2024-03-29,ATH,36,32,8,16,2,0,2,...,1.0,0.0,0.0,0.0,1.0,0.0,0.16129,0.033333,0.16129,0.033333
2,20240330_CLE@ATH,2024-03-30,ATH,40,32,6,7,6,2,0,...,3.0,0.0,2.0,2.0,3.0,0.0,0.223881,0.145161,0.223881,0.145161
3,20240331_CLE@ATH,2024-03-31,ATH,36,30,9,14,4,1,0,...,9.0,2.0,2.0,2.0,4.0,0.0,0.271028,0.106383,0.271028,0.106383
4,20240401_BOS@ATH,2024-04-01,ATH,31,31,4,5,0,0,0,...,13.0,3.0,2.0,2.0,5.0,2.0,0.342342,0.148936,0.302817,0.120968


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,20250331_CHC@ATH,2025-03-31,ATH,38,36,10,15,2,0,0,...,,,,,,,,,,
1,20250401_CHC@ATH,2025-04-01,ATH,35,33,8,14,2,0,0,...,2.0,0.0,0.0,1.0,2.0,0.0,0.315789,0.138889,0.315789,0.138889
2,20250402_CHC@ATH,2025-04-02,ATH,36,31,5,9,5,0,0,...,4.0,0.0,0.0,3.0,2.0,0.0,0.30137,0.15942,0.30137,0.15942
3,20250407_SD@ATH,2025-04-07,ATH,38,36,10,17,2,0,0,...,9.0,0.0,0.0,4.0,3.0,0.0,0.277778,0.129032,0.293578,0.15
4,20250408_SD@ATH,2025-04-08,ATH,39,36,13,20,3,0,0,...,9.0,0.0,0.0,5.0,2.0,0.0,0.297297,0.164179,0.293578,0.17


 


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,20220408_ATH@PHI,2022-04-08,ATH,35,34,6,13,0,1,0,...,,,,,,,,,,
1,20220409_ATH@PHI,2022-04-09,ATH,31,30,5,6,1,0,0,...,0.0,1.0,0.0,2.0,1.0,0.0,0.2,0.205882,0.2,0.205882
2,20220410_ATH@PHI,2022-04-10,ATH,39,35,9,14,3,1,0,...,1.0,1.0,0.0,2.0,2.0,0.0,0.19697,0.125,0.19697,0.125
3,20220411_ATH@TB,2022-04-11,ATH,45,41,13,29,3,1,0,...,4.0,2.0,0.0,3.0,4.0,0.0,0.247619,0.131313,0.247619,0.131313
4,20220412_ATH@TB,2022-04-12,ATH,45,35,8,12,9,1,0,...,7.0,3.0,0.0,7.0,6.0,1.0,0.313043,0.207547,0.286667,0.207143


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,20230407_ATH@TB,2023-04-07,ATH,38,36,12,20,1,1,0,...,,,,,,,,,,
1,20230408_ATH@TB,2023-04-08,ATH,32,28,3,3,3,0,0,...,1.0,1.0,0.0,2.0,2.0,0.0,0.368421,0.222222,0.368421,0.222222
2,20230409_ATH@TB,2023-04-09,ATH,29,28,1,2,1,0,0,...,4.0,1.0,0.0,2.0,2.0,0.0,0.289855,0.125,0.289855,0.125
3,20230410_ATH@BAL,2023-04-10,ATH,35,32,8,12,3,0,0,...,5.0,1.0,0.0,2.0,3.0,0.0,0.22449,0.097826,0.22449,0.097826
4,20230411_ATH@BAL,2023-04-11,ATH,43,39,12,16,4,0,0,...,8.0,1.0,0.0,2.0,5.0,1.0,0.2,0.056818,0.24812,0.104839


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,20240405_ATH@DET,2024-04-05,ATH,37,32,5,13,5,0,0,...,,,,,,,,,,
1,20240406_ATH@DET,2024-04-06,ATH,37,31,5,8,6,0,0,...,5.0,0.0,0.0,2.0,2.0,0.0,0.27027,0.25,0.27027,0.25
2,20240407_ATH@DET,2024-04-07,ATH,42,39,14,20,2,1,0,...,11.0,0.0,0.0,3.0,2.0,0.0,0.283784,0.174603,0.283784,0.174603
3,20240409_ATH@TEX,2024-04-09,ATH,35,32,5,14,3,0,0,...,13.0,1.0,0.0,4.0,3.0,1.0,0.35443,0.128571,0.327586,0.166667
4,20240410_ATH@TEX,2024-04-10,ATH,38,37,9,13,1,0,0,...,16.0,1.0,0.0,7.0,3.0,1.0,0.324675,0.211268,0.304636,0.19403


Unnamed: 0,game_id,game_date,batting_team,PA,AB,H,TB,BB,HBP,SF,...,roll_7D_BB,roll_7D_HBP,roll_7D_SF,roll_7D_HR,roll_7D__2B,roll_7D__3B,roll_3D_OBP,roll_3D_ISO,roll_7D_OBP,roll_7D_ISO
0,20250327_ATH@SEA,2025-03-27,ATH,30,29,3,9,1,0,0,...,,,,,,,,,,
1,20250328_ATH@SEA,2025-03-28,ATH,44,38,12,23,5,1,0,...,1.0,0.0,0.0,2.0,0.0,0.0,0.133333,0.206897,0.133333,0.206897
2,20250329_ATH@SEA,2025-03-29,ATH,38,36,9,12,2,0,0,...,6.0,1.0,0.0,5.0,2.0,0.0,0.297297,0.253731,0.297297,0.253731
3,20250330_ATH@SEA,2025-03-30,ATH,34,30,4,8,3,0,0,...,8.0,1.0,0.0,6.0,2.0,0.0,0.294643,0.194175,0.294643,0.194175
4,20250404_ATH@COL,2025-04-04,ATH,45,42,11,16,3,0,0,...,10.0,1.0,0.0,5.0,3.0,0.0,0.212121,0.133333,0.313043,0.173077


### Merging Home and Away Dataframes

This step merges the home and away rolling batting features into a single **game-level** DataFrame with **one row per `game_id`**. It first validates that both inputs contain the required columns and that each has exactly one row per game. It then renames the rolling metrics with `_home` and `_away` suffixes and merges them on `game_id`, producing columns like `roll_3D_OBP_home` and `roll_3D_OBP_away` (and the same for ISO and other windows).

This is an important step because placing both teams’ rolling metrics on the same row makes it straightforward to compute **home–away differences (deltas)** for modeling.


In [62]:
for y in range(2022, 2026):
    game_batting_rolls_name = f"game_batting_rolls_{y}"
    away_name = f"away_batting_{y}"
    home_name = f"home_batting_{y}"

    globals()[game_batting_rolls_name] = combine_home_away_batting_rolls(
        home_batting=globals()[home_name],
        away_batting=globals()[away_name],
        metrics=("roll_3D_OBP", "roll_3D_ISO", "roll_7D_OBP", "roll_7D_ISO"),
    )

In [63]:
display(HTML("<h4>Season 2022</h4>")); display(game_batting_rolls_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(game_batting_rolls_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(game_batting_rolls_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(game_batting_rolls_2025.head(5))

Unnamed: 0,game_id,game_date,roll_3D_OBP_home,roll_3D_ISO_home,roll_7D_OBP_home,roll_7D_ISO_home,roll_3D_OBP_away,roll_3D_ISO_away,roll_7D_OBP_away,roll_7D_ISO_away
0,20220407_CIN@ATL,2022-04-07,,,,,,,,
1,20220407_CLE@KC,2022-04-07,,,,,,,,
2,20220407_HOU@LAA,2022-04-07,,,,,,,,
3,20220407_MIL@CHC,2022-04-07,,,,,,,,
4,20220407_NYM@WSH,2022-04-07,,,,,,,,


Unnamed: 0,game_id,game_date,roll_3D_OBP_home,roll_3D_ISO_home,roll_7D_OBP_home,roll_7D_ISO_home,roll_3D_OBP_away,roll_3D_ISO_away,roll_7D_OBP_away,roll_7D_ISO_away
0,20230330_ATL@WSH,2023-03-30,,,,,,,,
1,20230330_AZ@LAD,2023-03-30,,,,,,,,
2,20230330_BAL@BOS,2023-03-30,,,,,,,,
3,20230330_CLE@SEA,2023-03-30,,,,,,,,
4,20230330_COL@SD,2023-03-30,,,,,,,,


Unnamed: 0,game_id,game_date,roll_3D_OBP_home,roll_3D_ISO_home,roll_7D_OBP_home,roll_7D_ISO_home,roll_3D_OBP_away,roll_3D_ISO_away,roll_7D_OBP_away,roll_7D_ISO_away
0,20240320_LAD@SD,2024-03-20,,,,,,,,
1,20240321_SD@LAD,2024-03-21,,,,,,,,
2,20240328_BOS@SEA,2024-03-28,,,,,,,,
3,20240328_CHC@TEX,2024-03-28,,,,,,,,
4,20240328_CLE@ATH,2024-03-28,,,,,,,,


Unnamed: 0,game_id,game_date,roll_3D_OBP_home,roll_3D_ISO_home,roll_7D_OBP_home,roll_7D_ISO_home,roll_3D_OBP_away,roll_3D_ISO_away,roll_7D_OBP_away,roll_7D_ISO_away
0,20250318_LAD@CHC,2025-03-18,,,,,,,,
1,20250319_LAD@CHC,2025-03-19,0.15625,0.033333,0.15625,0.033333,0.357143,0.058824,0.357143,0.058824
2,20250327_ATH@SEA,2025-03-27,,,,,,,,
3,20250327_ATL@SD,2025-03-27,,,,,,,,
4,20250327_BAL@TOR,2025-03-27,,,,,,,,


### Calculating Difference Between Home and Away

This step creates a new game-level table of **batting deltas** by subtracting the away team’s rolling metrics from the home team’s rolling metrics (**home − away**). It expects the combined game-level batting DataFrame to contain paired columns like `roll_3D_OBP_home` / `roll_3D_OBP_away` (and similarly for ISO and other windows). The output keeps `game_id` and `game_date` and adds delta columns named like `Δroll_3D_OBP`, `Δroll_3D_ISO`, `Δroll_7D_OBP`, and `Δroll_7D_ISO`, which are used as matchup features for modeling.


In [64]:
for y in range(2022, 2026):
    rolls_name = f"game_batting_rolls_{y}"
    deltas_name = f"game_batting_deltas_{y}"

    globals()[deltas_name] = make_batting_delta_df(
        game_batting_rolls=globals()[rolls_name],
        metrics=("roll_3D_OBP", "roll_3D_ISO", "roll_7D_OBP", "roll_7D_ISO"),
    )

In [65]:
display(HTML("<h4>Season 2022</h4>")); display(game_batting_deltas_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(game_batting_deltas_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(game_batting_deltas_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(game_batting_deltas_2025.head(5))

Unnamed: 0,game_id,game_date,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,20220407_CIN@ATL,2022-04-07,,,,
1,20220407_CLE@KC,2022-04-07,,,,
2,20220407_HOU@LAA,2022-04-07,,,,
3,20220407_MIL@CHC,2022-04-07,,,,
4,20220407_NYM@WSH,2022-04-07,,,,


Unnamed: 0,game_id,game_date,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,20230330_ATL@WSH,2023-03-30,,,,
1,20230330_AZ@LAD,2023-03-30,,,,
2,20230330_BAL@BOS,2023-03-30,,,,
3,20230330_CLE@SEA,2023-03-30,,,,
4,20230330_COL@SD,2023-03-30,,,,


Unnamed: 0,game_id,game_date,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,20240320_LAD@SD,2024-03-20,,,,
1,20240321_SD@LAD,2024-03-21,,,,
2,20240328_BOS@SEA,2024-03-28,,,,
3,20240328_CHC@TEX,2024-03-28,,,,
4,20240328_CLE@ATH,2024-03-28,,,,


Unnamed: 0,game_id,game_date,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,20250318_LAD@CHC,2025-03-18,,,,
1,20250319_LAD@CHC,2025-03-19,-0.200893,-0.02549,-0.200893,-0.02549
2,20250327_ATH@SEA,2025-03-27,,,,
3,20250327_ATL@SD,2025-03-27,,,,
4,20250327_BAL@TOR,2025-03-27,,,,


## Combining Pitcher and Batta Data

Now, we can finally combine pitcher and batter data. 

In [66]:
# ---- build combined per-year feature tables ----
for y in range(2022, 2026):
    globals()[f"game_features_{y}"] = combine_pitching_batting_deltas(
        pitching_deltas=globals()[f"game_pitching_deltas_{y}"],
        batting_deltas=globals()[f"game_batting_deltas_{y}"],
        how="inner",
    )


In [67]:
display(HTML("<h4>Season 2022</h4>")); display(game_features_2022.head(5))
display(HTML("<h4>Season 2023</h4>")); display(game_features_2023.head(5))
display(HTML("<h4>Season 2024</h4>")); display(game_features_2024.head(5))
display(HTML("<h4>Season 2025</h4>")); display(game_features_2025.head(5))

Unnamed: 0,game_id,game_date,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,20220407_CIN@ATL,2022-04-07,"Fried, Max","Mahle, Tyler",,,,,,,,,,,,,,
1,20220407_CLE@KC,2022-04-07,"Greinke, Zack","Bieber, Shane",,,,,,,,,,,,,,
2,20220407_HOU@LAA,2022-04-07,"Ohtani, Shohei","Valdez, Framber",,,,,,,,,,,,,,
3,20220407_MIL@CHC,2022-04-07,"Hendricks, Kyle","Burnes, Corbin",,,,,,,,,,,,,,
4,20220407_NYM@WSH,2022-04-07,"Corbin, Patrick","Megill, Tylor",,,,,,,,,,,,,,


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,20230330_ATL@WSH,2023-03-30,"Corbin, Patrick","Fried, Max",2.167879,0.726061,-0.665455,1.030909,0.975784,2.167879,0.726061,-0.665455,1.030909,0.975784,,,,
1,20230330_AZ@LAD,2023-03-30,"Urías, Julio","Gallen, Zac",0.670662,0.005336,-0.909239,0.448819,-1.126804,0.670662,0.005336,-0.909239,0.448819,-1.126804,,,,
2,20230330_BAL@BOS,2023-03-30,"Kluber, Corey","Gibson, Kyle",-0.710781,-0.110831,-0.069085,-0.187803,0.506586,-0.710781,-0.110831,-0.069085,-0.187803,0.506586,,,,
3,20230330_CLE@SEA,2023-03-30,"Castillo, Luis","Bieber, Shane",0.202156,0.087045,1.087248,-0.032947,0.298487,0.202156,0.087045,1.087248,-0.032947,0.298487,,,,
4,20230330_COL@SD,2023-03-30,"Snell, Blake","Márquez, Germán",-1.935456,-0.171018,4.636085,-0.721204,-0.408109,-1.935456,-0.171018,4.636085,-0.721204,-0.408109,,,,


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,20240320_LAD@SD,2024-03-20,"Darvish, Yu","Glasnow, Tyler",1.100972,0.27806,-2.70335,0.230955,0.49918,1.100972,0.27806,-2.70335,0.230955,0.49918,,,,
1,20240321_SD@LAD,2024-03-21,"Yamamoto, Yoshinobu","Musgrove, Joe",0.913134,0.12268,-0.48661,0.407219,-3.9375,0.913134,0.12268,-0.48661,0.407219,-3.9375,,,,
2,20240328_BOS@SEA,2024-03-28,"Castillo, Luis","Bello, Brayan",-0.743401,-0.266235,2.410068,-0.10726,-0.410919,-0.743401,-0.266235,2.410068,-0.10726,-0.410919,,,,
3,20240328_CHC@TEX,2024-03-28,"Eovaldi, Nathan","Steele, Justin",0.871255,-0.019033,-0.807548,0.220974,0.379042,0.871255,-0.019033,-0.807548,0.220974,0.379042,,,,
4,20240328_CLE@ATH,2024-03-28,"Wood, Alex","Bieber, Shane",1.642781,0.475976,-0.141701,0.347347,0.800054,1.642781,0.475976,-0.141701,0.347347,0.800054,,,,


Unnamed: 0,game_id,game_date,starter_pitcher_name_home,starter_pitcher_name_away,Δstarter_FIP_3D,Δstarter_WHIP_3D,Δstarter_K9_3D,Δstarter_HR9_3D,Δbullpen_FIP_3D,Δstarter_FIP_7D,Δstarter_WHIP_7D,Δstarter_K9_7D,Δstarter_HR9_7D,Δbullpen_FIP_7D,Δroll_3D_OBP,Δroll_3D_ISO,Δroll_7D_OBP,Δroll_7D_ISO
0,20250318_LAD@CHC,2025-03-18,"Imanaga, Shota","Yamamoto, Yoshinobu",1.123302,-0.073846,-1.38462,0.721391,-0.099229,1.123302,-0.073846,-1.38462,0.721391,-0.099229,,,,
1,20250319_LAD@CHC,2025-03-19,"Steele, Justin","Sasaki, Roki",-0.934686,-0.190904,0.61794,-0.423393,1.35,-0.934686,-0.190904,0.61794,-0.423393,1.35,-0.200893,-0.02549,-0.200893,-0.02549
2,20250327_ATH@SEA,2025-03-27,"Gilbert, Logan","Severino, Luis",-0.960329,-0.41391,1.439538,-0.029619,0.093181,-0.960329,-0.41391,1.439538,-0.029619,0.093181,,,,
3,20250327_ATL@SD,2025-03-27,"King, Michael","Sale, Chris",1.198333,0.157857,-1.071429,0.394286,0.190382,1.198333,0.157857,-1.071429,0.394286,0.190382,,,,
4,20250327_BAL@TOR,2025-03-27,"Berríos, José","Eflin, Zach",0.97151,0.066755,-0.065855,0.268723,0.973941,0.97151,0.066755,-0.065855,0.268723,0.973941,,,,
