In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [2]:
# Cleaned ball-by-ball deliveries file, given relative to this notebook.
DATA_PATH = "../data/cleaned/ipl_cleaned_ball_by_ball.csv"

# `date` is parsed to datetime at load time because the feature cells below
# sort on it to put each batter's rows in chronological order.
ipl = pd.read_csv(
    filepath_or_buffer=DATA_PATH,
    parse_dates=["date"],
)

print("Original shape:", ipl.shape)
ipl.head(n=5)

Original shape: (260759, 22)


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,...,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder,id,date,venue,team1,team2
0,335982,1,kolkata knight riders,royal challengers bengaluru,0,1,SC Ganguly,P Kumar,BB McCullum,0,...,legbyes,0,,,,335982,2008-04-18,M Chinnaswamy Stadium,royal challengers bengaluru,kolkata knight riders
1,335982,1,kolkata knight riders,royal challengers bengaluru,0,2,BB McCullum,P Kumar,SC Ganguly,0,...,,0,,,,335982,2008-04-18,M Chinnaswamy Stadium,royal challengers bengaluru,kolkata knight riders
2,335982,1,kolkata knight riders,royal challengers bengaluru,0,3,BB McCullum,P Kumar,SC Ganguly,0,...,wides,0,,,,335982,2008-04-18,M Chinnaswamy Stadium,royal challengers bengaluru,kolkata knight riders
3,335982,1,kolkata knight riders,royal challengers bengaluru,0,4,BB McCullum,P Kumar,SC Ganguly,0,...,,0,,,,335982,2008-04-18,M Chinnaswamy Stadium,royal challengers bengaluru,kolkata knight riders
4,335982,1,kolkata knight riders,royal challengers bengaluru,0,5,BB McCullum,P Kumar,SC Ganguly,0,...,,0,,,,335982,2008-04-18,M Chinnaswamy Stadium,royal challengers bengaluru,kolkata knight riders


In [3]:
# Put every batter's deliveries in strict chronological order. Sorting on
# ("batter", "date") alone leaves the order of balls *within* a match to the
# sort's tie-handling, and the running average below depends on that order,
# so the match / innings / over / ball keys are spelled out explicitly.
ipl = ipl.sort_values(["batter", "date", "match_id", "inning", "over", "ball"])

# Batter-vs-bowler running average of runs per ball. `shift()` removes the
# current delivery from its own average, so each value is built only from
# earlier balls of that batter against that bowler (earlier balls of the same
# match included). The first ball of every batter/bowler pairing is NaN.
ipl["pvp_runs_avg"] = (
    ipl.groupby(["batter", "bowler"])["batsman_runs"]
    .transform(lambda runs: runs.shift().expanding().mean())
)

In [4]:
# Collapse the ball-by-ball rows into one row per batter per match.
#
# A wide is not a ball faced by the batter, so `balls_faced` counts every
# delivery except those whose `extras_type` is "wides". Rows with a missing
# `extras_type` compare unequal to "wides" and are therefore counted.
match_keys = ["id", "date", "batter", "batting_team", "bowling_team", "venue"]

batsman_df = (
    ipl.assign(faced_ball=ipl["extras_type"].ne("wides"))
    .groupby(match_keys)
    .agg(
        runs_scored=("batsman_runs", "sum"),
        balls_faced=("faced_ball", "sum"),
        # Mean of the per-ball batter-vs-bowler averages over the match; NaN
        # when none of the match's deliveries has an earlier ball to average.
        pvp_match_avg=("pvp_runs_avg", "mean"),
    )
    .reset_index()
    # Chained rename instead of a separate `inplace=True` mutation.
    .rename(columns={"id": "match_id"})
)

print("Aggregated shape:", batsman_df.shape)

Aggregated shape: (16515, 9)


In [5]:
# Recent-form features: mean runs over the batter's previous matches.
# Rows are put in date order per batter first; `shift()` keeps the current
# match out of its own window, and `min_periods=1` produces a value as soon as
# one earlier match exists (a batter's first match is NaN).
batsman_df = batsman_df.sort_values(["batter", "date"])

rolling_windows = {"runs_avg_last_5": 5, "runs_avg_last_10": 10}

for feature_name, window in rolling_windows.items():
    batsman_df[feature_name] = (
        batsman_df.groupby("batter")["runs_scored"]
        # `w=window` binds the current window size to this lambda.
        .transform(lambda runs, w=window: runs.shift().rolling(w, min_periods=1).mean())
    )

In [6]:
# Career-to-date features, all measured *before* the row's match.
runs_incl_current = batsman_df.groupby("batter")["runs_scored"].cumsum()
matches_before = batsman_df.groupby("batter").cumcount()

# Running total minus this match's runs = runs scored in earlier matches only.
batsman_df["career_runs"] = runs_incl_current - batsman_df["runs_scored"]

# cumcount() is 0 on a batter's first row, i.e. the number of earlier matches.
batsman_df["career_matches"] = matches_before

# A zero match count is turned into NaN so the division yields NaN for a
# batter's first match instead of dividing by zero.
safe_match_count = batsman_df["career_matches"].replace(0, np.nan)
batsman_df["career_avg"] = batsman_df["career_runs"] / safe_match_count

In [7]:
def _mean_of_earlier_rows(scores):
    """Return the expanding mean of all earlier values; the first row is NaN."""
    return scores.shift().expanding().mean()


# Batter's average runs at this venue over earlier matches played there.
# Relies on `batsman_df` already being in date order per batter.
runs_at_venue = batsman_df.groupby(["batter", "venue"])["runs_scored"]
batsman_df["venue_avg"] = runs_at_venue.transform(_mean_of_earlier_rows)

In [8]:
# Batter's average runs against this bowling team over earlier matches.
# `shift()` excludes the current match; the first meeting with a team is NaN.
# Relies on `batsman_df` already being in date order per batter.
runs_vs_opponent = batsman_df.groupby(["batter", "bowling_team"])["runs_scored"]
batsman_df["pvt_avg"] = runs_vs_opponent.transform(
    lambda past: past.shift().expanding().mean()
)

In [9]:
# Target: the runs the same batter scores in the row that follows this one
# (rows are in date order per batter, so that is the batter's next match).
#
# NOTE(review): the averages built in the cells above use shift(), so they
# describe history *before* this row's match, and batting_team / bowling_team /
# venue describe this row's match, while the target is the match after it.
# Confirm that this one-match offset between features and target is intended.
next_match_runs = batsman_df.groupby("batter")["runs_scored"].shift(-1)
batsman_df["runs_next_match"] = next_match_runs

# A batter's last recorded match has no following match, so it has no target.
has_target = batsman_df["runs_next_match"].notna()
batsman_df = batsman_df[has_target]

In [10]:
# History features that are NaN when a batter has no earlier data to average.
numeric_cols = [
    "runs_avg_last_5", "runs_avg_last_10",
    "career_avg",
    "venue_avg", "pvt_avg", "pvp_match_avg",
]

# Replace the missing averages with 0.
# NOTE(review): after this step "no history" is indistinguishable from a
# genuine average of zero; confirm that is acceptable for the model.
batsman_df = batsman_df.fillna({col: 0 for col in numeric_cols})

In [11]:
# Columns kept in the modelling table, in output order.
final_cols = [
    # identifiers
    "match_id", "date", "batter",
    # match context
    "batting_team", "bowling_team", "venue",
    # history features
    "runs_avg_last_5", "runs_avg_last_10",
    "career_avg", "venue_avg", "pvt_avg", "pvp_match_avg",
    # target
    "runs_next_match",
]

# Independent copy, so later edits to it never write back into `batsman_df`.
batsman_model_df = batsman_df.loc[:, final_cols].copy()

print("Final shape:", batsman_model_df.shape)

Final shape: (15842, 13)


In [12]:
# Write the modelling table to disk. The target directory is created first so
# the export also works on a fresh checkout where it does not exist yet.
processed_csv_path = Path("../data/processed/batsman_model_data.csv")
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)

batsman_model_df.to_csv(processed_csv_path, index=False)
print("✅ Dataset saved")

✅ Dataset saved


In [13]:
# Column the model is trained to predict.
target = "runs_next_match"

# Columns handled by the categorical branch of the preprocessor.
categorical_features = ["batting_team", "bowling_team", "venue"]

# Columns handled by the numeric branch of the preprocessor.
numeric_features = [
    "runs_avg_last_5", "runs_avg_last_10",
    "career_avg",
    "venue_avg", "pvt_avg", "pvp_match_avg",
]

In [14]:
# Same names as the notebook's first import cell; repeated here so this cell
# also runs on its own.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Standardise the numeric features and one-hot encode the categorical ones.
# handle_unknown="ignore" lets transform() accept categories that were not
# present when the encoder was fitted instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# Feature matrix: everything except the target and the identifier columns.
X = batsman_model_df.drop(columns=[target, "match_id", "date", "batter"])
y = batsman_model_df[target]

# NOTE(review): the preprocessor is fitted on every row of the table. If a
# train/test split is made downstream, the scaler statistics and the encoder
# categories will have seen the test rows; confirm this is intended.
preprocessor.fit(X)

# Create the output directory first so the dump also works on a fresh checkout
# where it does not exist yet.
pipeline_path = Path("../model/feature_pipeline_batsman.pkl")
pipeline_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(preprocessor, pipeline_path)

print("✅ Pipeline saved")

✅ Pipeline saved
