In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib

In [2]:
# Cleaned ball-by-ball export, addressed relative to this notebook.
DATA_PATH = "../data/cleaned/ipl_cleaned_ball_by_ball.csv"

# "date" is parsed on load because later cells sort on it and compare it
# against the train/test cut-off.
ipl = pd.read_csv(
    DATA_PATH,
    parse_dates=["date"],
)

print("Original shape:", ipl.shape)

Original shape: (260759, 22)


In [3]:
# Group rows by bowler, oldest date first, so that the shifted expanding
# statistic computed in the next cell only ever looks at earlier rows.
ipl = ipl.sort_values(by=["bowler", "date"])

In [4]:
# Running wicket rate of each bowler against each batter.
# shift() removes the current row from its own average, so a row only sees
# earlier rows of the same (bowler, batter) pair; the first row of every pair
# is therefore NaN. There is no match key in the grouping, so earlier rows of
# the same match do count towards later rows of that match.
pvp_groups = ipl.groupby(["bowler", "batter"])["is_wicket"]

ipl["pvp_wickets_avg"] = pvp_groups.transform(
    lambda deliveries: deliveries.shift().expanding().mean()
)

In [5]:
# Collapse the ball-level frame to one row per (match, bowler).
match_keys = ["id", "date", "bowler", "bowling_team", "batting_team", "venue"]

bowler_df = (
    ipl.groupby(match_keys)
    .agg(
        # NOTE(review): sums every is_wicket flag on the bowler's rows; whether
        # that includes dismissals not credited to the bowler depends on how
        # the cleaned data defines is_wicket - confirm.
        wickets_taken=("is_wicket", "sum"),
        # Number of non-null is_wicket rows in the group.
        balls_bowled=("is_wicket", "count"),
        runs_conceded=("total_runs", "sum"),
        # Mean of the per-row bowler-vs-batter running average within the match.
        pvp_match_avg=("pvp_wickets_avg", "mean"),
    )
    .reset_index()
    .rename(columns={"id": "match_id"})
)

print("Aggregated shape:", bowler_df.shape)

Aggregated shape: (12978, 10)


In [6]:
# Chronological order within each bowler: the shift(), cumsum() and cumcount()
# features in the following cells all rely on this row order.
bowler_df = bowler_df.sort_values(by=["bowler", "date"])

In [7]:
def _prior_rolling(series, window, stat="mean"):
    """Rolling statistic over the previous `window` rows, excluding the current row.

    Parameters
    ----------
    series : pandas.Series
        Per-match values for a single bowler, already in chronological order.
    window : int
        Maximum number of previous matches to include.
    stat : str
        Name of the rolling aggregation to apply ("mean" or "sum").

    Returns
    -------
    pandas.Series
        Same index as `series`. The first row is NaN because shift() leaves it
        with no previous match to aggregate.
    """
    # shift() first so the current match never contributes to its own feature.
    prior_window = series.shift().rolling(window, min_periods=1)
    return getattr(prior_window, stat)()


by_bowler = bowler_df.groupby("bowler")

# Compute everything before assigning so the grouped frame is not mutated
# while the group-wise transforms are still being evaluated.
wickets_avg_5 = by_bowler["wickets_taken"].transform(
    lambda wickets: _prior_rolling(wickets, 5)
)
wickets_avg_10 = by_bowler["wickets_taken"].transform(
    lambda wickets: _prior_rolling(wickets, 10)
)
runs_last_5 = by_bowler["runs_conceded"].transform(
    lambda runs: _prior_rolling(runs, 5, "sum")
)
balls_last_5 = by_bowler["balls_bowled"].transform(
    lambda balls: _prior_rolling(balls, 5, "sum")
)

bowler_df["wickets_avg_last_5"] = wickets_avg_5
bowler_df["wickets_avg_last_10"] = wickets_avg_10

# Economy rate = runs conceded per over (6 balls), pooled over the previous
# five matches. The earlier version stored the mean runs conceded per match
# under this name, which is not an economy rate.
# NOTE(review): balls_bowled is a plain row count from the match aggregation,
# so any wides/no-balls present as rows are counted as balls - confirm whether
# only legal deliveries should be used.
bowler_df["economy_last_5"] = 6 * runs_last_5 / balls_last_5.replace(0, np.nan)

In [8]:
# Career totals as they stood *before* each match.
running_wickets = bowler_df.groupby("bowler")["wickets_taken"].cumsum()

# The running total includes the current match, so subtract its own wickets.
bowler_df["career_wickets"] = running_wickets - bowler_df["wickets_taken"]

# cumcount() numbers a bowler's rows 0, 1, 2, ... which equals the number of
# earlier matches for that bowler.
bowler_df["career_matches"] = bowler_df.groupby("bowler").cumcount()

# A bowler's first row has zero earlier matches; map that to NaN so the
# division yields NaN rather than dividing by zero.
earlier_matches = bowler_df["career_matches"].replace(0, np.nan)
bowler_df["career_wicket_avg"] = bowler_df["career_wickets"].div(earlier_matches)

In [9]:
# Bowler's average wickets per match at this venue, using only earlier rows
# for the same (bowler, venue) pair; NaN on the first row of each pair.
venue_groups = bowler_df.groupby(["bowler", "venue"])["wickets_taken"]

bowler_df["venue_wicket_avg"] = venue_groups.transform(
    lambda wickets: wickets.shift().expanding().mean()
)

In [10]:
# Bowler's average wickets per match against this batting team, using only
# earlier rows for the same (bowler, batting_team) pair; NaN on the first row
# of each pair.
opponent_groups = bowler_df.groupby(["bowler", "batting_team"])["wickets_taken"]

bowler_df["pvt_wicket_avg"] = opponent_groups.transform(
    lambda wickets: wickets.shift().expanding().mean()
)

In [11]:
# Target: the wickets the same bowler takes in the row that follows this one.
# NOTE(review): every feature above is shifted to use only rows *before* this
# row's match, and venue / batting_team describe this row's match, while the
# target comes from the *following* match. Confirm this alignment is intended.
next_match_wickets = bowler_df.groupby("bowler")["wickets_taken"].shift(-1)

# A bowler's last row has no following match, so its target is NaN and the
# row is dropped.
bowler_df = (
    bowler_df
    .assign(wickets_next_match=next_match_wickets)
    .dropna(subset=["wickets_next_match"])
)

In [12]:
# Feature columns that are NaN whenever there is no earlier row to average over.
numeric_cols = [
    "wickets_avg_last_5",
    "wickets_avg_last_10",
    "career_wicket_avg",
    "venue_wicket_avg",
    "pvt_wicket_avg",
    "pvp_match_avg",
]

# Replace the missing values with 0 in those columns only. After this step
# "no history" and "an average of exactly zero" are indistinguishable.
bowler_df = bowler_df.fillna({column: 0 for column in numeric_cols})

In [13]:
# Columns written to the modelling table, in output order.
identifier_cols = ["match_id", "date", "bowler"]
context_cols = ["bowling_team", "batting_team", "venue"]
history_cols = [
    "wickets_avg_last_5",
    "wickets_avg_last_10",
    "career_wicket_avg",
    "venue_wicket_avg",
    "pvt_wicket_avg",
    "pvp_match_avg",
]

final_cols = identifier_cols + context_cols + history_cols + ["wickets_next_match"]

# Independent copy so later edits to the model table cannot touch bowler_df.
bowler_model_df = bowler_df.loc[:, final_cols].copy()

print("Final shape:", bowler_model_df.shape)

Final shape: (12448, 13)


In [14]:
# Output folder for the modelling table; created on first run.
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(exist_ok=True)

# Written without the index so the CSV holds only the selected columns.
model_data_path = PROCESSED_DIR / "bowler_model_data.csv"
bowler_model_df.to_csv(model_data_path, index=False)

print("✅ bowler_model_data.csv saved")

✅ bowler_model_data.csv saved


In [15]:
# Column the model is trained to predict.
target = "wickets_next_match"

# Columns handed to the one-hot encoder.
categorical_features = ["bowling_team", "batting_team", "venue"]

# Columns handed to the scaler.
numeric_features = [
    "wickets_avg_last_5",
    "wickets_avg_last_10",
    "career_wicket_avg",
    "venue_wicket_avg",
    "pvt_wicket_avg",
    "pvp_match_avg",
]

In [16]:
# Cut-off separating training rows from held-out rows. Kept identical to the
# value used by the train/test split cell that follows.
split_date = "2023-01-01"

# Scale the numeric history features and one-hot encode the categorical
# context. handle_unknown="ignore" lets the encoder accept teams or venues at
# transform time that it did not see while fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

X = bowler_model_df.drop(columns=[target, "match_id", "date", "bowler"])
y = bowler_model_df[target]

# Fit on rows before the cut-off only. Fitting on every row would let the
# held-out period influence the scaler's mean/variance and the encoder's
# category list, leaking test information into the saved pipeline.
X_fit = X[bowler_model_df["date"] < split_date]
preprocessor.fit(X_fit)

ARTIFACT_DIR = Path("../model")
# parents=True so a missing intermediate folder does not abort the save.
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(preprocessor, ARTIFACT_DIR / "feature_pipeline_bowler.pkl")

print("✅ feature_pipeline_bowler.pkl saved")

✅ feature_pipeline_bowler.pkl saved


In [17]:
# Time-based split: rows dated before the cut-off train the model, rows on or
# after it are held out.
split_date = "2023-01-01"

match_dates = bowler_model_df["date"]
train_df = bowler_model_df.loc[match_dates < split_date]
test_df = bowler_model_df.loc[match_dates >= split_date]

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (10878, 13)
Test shape: (1570, 13)
