In [23]:
#Load Libraries&Data
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


In [24]:
ball_df = pd.read_csv("ball_by_ball.csv")
match_df = pd.read_csv("matches.csv")

print(ball_df.shape)
print(match_df.shape)
ball_df.head()


(260920, 17)
(1095, 20)


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


In [25]:
#Preparing  Match Metadata
match_df["date"] = pd.to_datetime(match_df["date"])

match_df = match_df[["id", "date", "venue", "team1", "team2"]] \
            .rename(columns={"id": "match_id"})

match_df.head()


Unnamed: 0,match_id,date,venue,team1,team2
0,335982,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders
1,335983,2008-04-19,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings
2,335984,2008-04-19,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals
3,335985,2008-04-20,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore
4,335986,2008-04-20,Eden Gardens,Kolkata Knight Riders,Deccan Chargers


In [26]:
#Aggregate Ball-by-Ball â†’ Player-Match Level
#Batsman Aggregation
batsman_match = (
    ball_df.groupby(["match_id", "batter", "batting_team"])
    .agg(
        runs=("batsman_runs", "sum"),
        balls_faced=("ball", "count"),
        fours=("batsman_runs", lambda x: (x == 4).sum()),
        sixes=("batsman_runs", lambda x: (x == 6).sum())
    )
    .reset_index()
)

batsman_match.head()


Unnamed: 0,match_id,batter,batting_team,runs,balls_faced,fours,sixes
0,335982,AA Noffke,Royal Challengers Bangalore,9,12,1,0
1,335982,B Akhil,Royal Challengers Bangalore,0,2,0,0
2,335982,BB McCullum,Kolkata Knight Riders,158,77,10,13
3,335982,CL White,Royal Challengers Bangalore,6,10,0,0
4,335982,DJ Hussey,Kolkata Knight Riders,12,12,1,0


In [27]:
#Bowler Aggregation
bowler_match = (
    ball_df.groupby(["match_id", "bowler", "bowling_team"])
    .agg(
        wickets=("is_wicket", "sum"),
        runs_conceded=("total_runs", "sum"),
        balls_bowled=("ball", "count")
    )
    .reset_index()
)

bowler_match.head()


Unnamed: 0,match_id,bowler,bowling_team,wickets,runs_conceded,balls_bowled
0,335982,AA Noffke,Royal Challengers Bangalore,1,41,25
1,335982,AB Agarkar,Kolkata Knight Riders,3,25,28
2,335982,AB Dinda,Kolkata Knight Riders,2,9,20
3,335982,CL White,Royal Challengers Bangalore,0,24,7
4,335982,I Sharma,Kolkata Knight Riders,1,13,19


In [28]:
#Merge Match Info & Create Opponent
batsman_match = batsman_match.merge(match_df, on="match_id", how="left")
bowler_match = bowler_match.merge(match_df, on="match_id", how="left")


In [29]:
batsman_match["opponent_team"] = np.where(
    batsman_match["batting_team"] == batsman_match["team1"],
    batsman_match["team2"],
    batsman_match["team1"]
)

bowler_match["opponent_team"] = np.where(
    bowler_match["bowling_team"] == bowler_match["team1"],
    bowler_match["team2"],
    bowler_match["team1"]
)


In [30]:
#Sort for Time-Series Processing
batsman_match = batsman_match.sort_values(["batter", "date"])
bowler_match = bowler_match.sort_values(["bowler", "date"])


In [31]:
#Rolling Averages (Recent Form)
#Batsmen
for w in [5, 10]:
    batsman_match[f"runs_avg_last_{w}"] = (
        batsman_match.groupby("batter")["runs"]
        .transform(lambda x: x.shift(1).rolling(w).mean())
    )


In [32]:
#Bowlers
for w in [5, 10]:
    bowler_match[f"wickets_avg_last_{w}"] = (
        bowler_match.groupby("bowler")["wickets"]
        .transform(lambda x: x.shift(1).rolling(w).mean())
    )


In [33]:
#Venue & Opponent-Specific Stats (PvT)
batsman_match["venue_avg_runs"] = (
    batsman_match.groupby(["batter", "venue"])["runs"].transform("mean")
)

batsman_match["opponent_avg_runs"] = (
    batsman_match.groupby(["batter", "opponent_team"])["runs"].transform("mean")
)




In [34]:

bowler_match["venue_avg_wickets"] = (
    bowler_match.groupby(["bowler", "venue"])["wickets"].transform("mean")
)

bowler_match["opponent_avg_wickets"] = (
    bowler_match.groupby(["bowler", "opponent_team"])["wickets"].transform("mean")
)


In [46]:
#Career Statistics
batsman_match["career_matches"] = batsman_match.groupby("batter").cumcount()
batsman_match["career_avg_runs"] = (
    batsman_match.groupby("batter")["runs"]
    .transform(lambda x: x.shift(1).expanding().mean())
)


In [36]:

bowler_match["career_matches"] = bowler_match.groupby("bowler").cumcount()
bowler_match["career_avg_wickets"] = (
    bowler_match.groupby("bowler")["wickets"]
    .transform(lambda x: x.shift(1).expanding().mean())
)


In [37]:
#Target Variable (Next Match Performance)
batsman_match["target_runs_next_match"] = (
    batsman_match.groupby("batter")["runs"].shift(-1)
)

bowler_match["target_wickets_next_match"] = (
    bowler_match.groupby("bowler")["wickets"].shift(-1)
)


In [38]:
#Handle Missing Values
batsman_match.fillna(0, inplace=True)
bowler_match.fillna(0, inplace=True)


In [39]:
#Final Feature-Engineered Dataset
dataset = batsman_match[
    [
        "batter", "venue", "opponent_team",
        "runs_avg_last_5", "runs_avg_last_10",
        "venue_avg_runs", "opponent_avg_runs",
        "career_avg_runs", "career_matches",
        "target_runs_next_match"
    ]
]

dataset.head()


Unnamed: 0,batter,venue,opponent_team,runs_avg_last_5,runs_avg_last_10,venue_avg_runs,opponent_avg_runs,career_avg_runs,career_matches,target_runs_next_match
4299,A Ashish Reddy,Wankhede Stadium,Mumbai Indians,0.0,0.0,10.0,13.5,0.0,0,3.0
4390,A Ashish Reddy,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,0.0,0.0,19.5,15.0,10.0,1,8.0
4496,A Ashish Reddy,"Rajiv Gandhi International Stadium, Uppal",Kings XI Punjab,0.0,0.0,8.454545,12.333333,6.5,2,10.0
4699,A Ashish Reddy,"Rajiv Gandhi International Stadium, Uppal",Rajasthan Royals,0.0,0.0,8.454545,12.333333,7.0,3,4.0
4747,A Ashish Reddy,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,0.0,0.0,8.454545,11.0,7.75,4,7.0


In [40]:
#Train-Test Split (Time-Series Aware)
train = dataset.iloc[:-1000]
test = dataset.iloc[-1000:]

print(train.shape, test.shape)


(15515, 10) (1000, 10)


In [41]:
#Preprocessing Pipeline (Deliverable)
num_features = [
    "runs_avg_last_5", "runs_avg_last_10",
    "venue_avg_runs", "opponent_avg_runs",
    "career_avg_runs", "career_matches"
]

cat_features = ["venue", "opponent_team"]

feature_pipeline = ColumnTransformer(
    [
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)
    ]
)


In [42]:
joblib.dump(feature_pipeline, "feature_pipeline.pkl")
print("Preprocessor saved")


Preprocessor saved


In [43]:
#Save Final Dataset (Deliverable)
dataset.to_csv("dataset.csv", index=False)
print("Feature-engineered dataset saved")


Feature-engineered dataset saved
