# IPL Feature Engineering – Player Match Level

## Week 3–4 Tasks:
1. Aggregate ball-by-ball data to player–match level  
2. Engineer basic player form features (rolling averages)

This notebook builds features required for ML model development
using cleaned IPL ball-by-ball data.

In [23]:
# List the dataset's column labels up front so later cells reference valid names.
# NOTE(review): `ipl_df` is only created by the "Load Cleaned Dataset" cell further
# down (and pandas is imported after this cell), so on a fresh kernel with
# Restart & Run All this cell raises NameError. It should be moved below the load.
list(ipl_df.columns)


['matchId',
 'inning',
 'over_ball',
 'over',
 'ball',
 'batting_team',
 'bowling_team',
 'batsman',
 'non_striker',
 'bowler',
 'batsman_runs',
 'extras',
 'isWide',
 'isNoBall',
 'Byes',
 'LegByes',
 'Penalty',
 'dismissal_kind',
 'player_dismissed',
 'date_x',
 'eliminator',
 'team1',
 'neutralvenue',
 'balls_per_over',
 'umpire2',
 'umpire1',
 'outcome',
 'venue',
 'date1',
 'date2',
 'method',
 'date_y',
 'team2',
 'player_of_match',
 'winner_wickets',
 'winner_runs',
 'reserve_umpire',
 'season',
 'city',
 'winner',
 'match_number',
 'event',
 'gender',
 'match_referee',
 'tv_umpire',
 'toss_winner',
 'toss_decision',
 'total_runs']

### 2 – Import Libraries

In [24]:
import pandas as pd
import numpy as np
import os

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None


### 3 – Paths

In [25]:
# All paths are resolved relative to the current working directory
# (the folder the notebook is started from).
BASE_DIR = os.getcwd()

# Input: the cleaned, merged ball-by-ball CSV.
CLEANED_DATA_PATH = os.path.join(BASE_DIR, "..", "data", "cleaned", "ipl_merged_cleaned.csv")

# Output: folder for engineered feature files; created if it does not exist yet.
FEATURE_DIR = os.path.join(BASE_DIR, "..", "data", "features")
if not os.path.isdir(FEATURE_DIR):
    os.makedirs(FEATURE_DIR, exist_ok=True)


### 4 – Load Cleaned Dataset

In [26]:
# Load the cleaned ball-by-ball dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed Python types.
# NOTE(review): the recorded run of this cell emitted a warning that points at the
# read_csv line -- presumably a DtypeWarning about mixed-type columns (for example
# `season` contains both "2007/08" and 2012). Confirm, and ideally pin explicit
# dtypes for those columns.
ipl_df = pd.read_csv(CLEANED_DATA_PATH, low_memory=False)

print("Dataset loaded successfully")
print("Shape:", ipl_df.shape)
ipl_df.head()


  ipl_df = pd.read_csv(CLEANED_DATA_PATH)


Dataset loaded successfully
Shape: (260920, 48)


Unnamed: 0,matchId,inning,over_ball,over,ball,batting_team,bowling_team,batsman,non_striker,bowler,batsman_runs,extras,isWide,isNoBall,Byes,LegByes,Penalty,dismissal_kind,player_dismissed,date_x,eliminator,team1,neutralvenue,balls_per_over,umpire2,umpire1,outcome,venue,date1,date2,method,date_y,team2,player_of_match,winner_wickets,winner_runs,reserve_umpire,season,city,winner,match_number,event,gender,match_referee,tv_umpire,toss_winner,toss_decision,total_runs
0,335982,1,0.1,0,1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,1.0,1.0,1.0,1.0,5.0,,,2008-04-18,Kings XI Punjab,Royal Challengers Bangalore,True,6,RE Koertzen,Asad Rauf,tie,M Chinnaswamy Stadium,2014/05/27,2014/05/28,D/L,2008-04-18,Kolkata Knight Riders,BB McCullum,6.0,140.0,VN Kulkarni,2007/08,Bangalore,Kolkata Knight Riders,1.0,Indian Premier League,male,J Srinath,AM Saheba,Royal Challengers Bangalore,field,1
1,335982,1,0.2,0,2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,1.0,1.0,1.0,1.0,5.0,,,2008-04-18,Kings XI Punjab,Royal Challengers Bangalore,True,6,RE Koertzen,Asad Rauf,tie,M Chinnaswamy Stadium,2014/05/27,2014/05/28,D/L,2008-04-18,Kolkata Knight Riders,BB McCullum,6.0,140.0,VN Kulkarni,2007/08,Bangalore,Kolkata Knight Riders,1.0,Indian Premier League,male,J Srinath,AM Saheba,Royal Challengers Bangalore,field,0
2,335982,1,0.3,0,3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,1,1.0,1.0,1.0,1.0,5.0,,,2008-04-18,Kings XI Punjab,Royal Challengers Bangalore,True,6,RE Koertzen,Asad Rauf,tie,M Chinnaswamy Stadium,2014/05/27,2014/05/28,D/L,2008-04-18,Kolkata Knight Riders,BB McCullum,6.0,140.0,VN Kulkarni,2007/08,Bangalore,Kolkata Knight Riders,1.0,Indian Premier League,male,J Srinath,AM Saheba,Royal Challengers Bangalore,field,1
3,335982,1,0.4,0,4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,1.0,1.0,1.0,1.0,5.0,,,2008-04-18,Kings XI Punjab,Royal Challengers Bangalore,True,6,RE Koertzen,Asad Rauf,tie,M Chinnaswamy Stadium,2014/05/27,2014/05/28,D/L,2008-04-18,Kolkata Knight Riders,BB McCullum,6.0,140.0,VN Kulkarni,2007/08,Bangalore,Kolkata Knight Riders,1.0,Indian Premier League,male,J Srinath,AM Saheba,Royal Challengers Bangalore,field,0
4,335982,1,0.5,0,5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,1.0,1.0,1.0,1.0,5.0,,,2008-04-18,Kings XI Punjab,Royal Challengers Bangalore,True,6,RE Koertzen,Asad Rauf,tie,M Chinnaswamy Stadium,2014/05/27,2014/05/28,D/L,2008-04-18,Kolkata Knight Riders,BB McCullum,6.0,140.0,VN Kulkarni,2007/08,Bangalore,Kolkata Knight Riders,1.0,Indian Premier League,male,J Srinath,AM Saheba,Royal Challengers Bangalore,field,0


### 5 – Select Required Columns

In [27]:
# Columns needed to build batting features; everything else in ipl_df is ignored here.
batting_columns = [
    "matchId",
    "season",
    "batsman",
    "batting_team",
    "bowling_team",
    "venue",
    "batsman_runs",
    "ball",
]

batting_df = ipl_df.loc[:, batting_columns]

batting_df.head()


Unnamed: 0,matchId,season,batsman,batting_team,bowling_team,venue,batsman_runs,ball
0,335982,2007/08,SC Ganguly,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,0,1
1,335982,2007/08,BB McCullum,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,0,2
2,335982,2007/08,BB McCullum,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,0,3
3,335982,2007/08,BB McCullum,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,0,4
4,335982,2007/08,BB McCullum,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,0,5


### 6 – Aggregate Ball-by-Ball → Player-Match Level

In [28]:
# One output row per (matchId, batsman). Each entry maps an output column to
# (source column, aggregation).
#   - balls_faced counts non-null `ball` values, i.e. every delivery row of the batsman.
#   - fours / sixes count deliveries whose batsman_runs is exactly 4 / 6.
#   - team / opponent / venue / season take the first value seen within the group.
player_match_aggregations = {
    "runs_scored": ("batsman_runs", "sum"),
    "balls_faced": ("ball", "count"),
    "fours": ("batsman_runs", lambda runs: runs.eq(4).sum()),
    "sixes": ("batsman_runs", lambda runs: runs.eq(6).sum()),
    "team": ("batting_team", "first"),
    "opponent": ("bowling_team", "first"),
    "venue": ("venue", "first"),
    "season": ("season", "first"),
}

grouped_batting = batting_df.groupby(["matchId", "batsman"])
player_match_df = grouped_batting.agg(**player_match_aggregations).reset_index()

player_match_df.head()


Unnamed: 0,matchId,batsman,runs_scored,balls_faced,fours,sixes,team,opponent,venue,season
0,335982,AA Noffke,9,12,1,0,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,2007/08
1,335982,B Akhil,0,2,0,0,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,2007/08
2,335982,BB McCullum,158,77,10,13,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,2007/08
3,335982,CL White,6,10,0,0,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,2007/08
4,335982,DJ Hussey,12,12,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,2007/08


### 7 – Create Strike Rate Feature

In [29]:
# Strike rate = runs per ball faced, expressed per 100 balls.
runs_per_ball = player_match_df["runs_scored"].div(player_match_df["balls_faced"])
player_match_df["strike_rate"] = runs_per_ball.mul(100)

player_match_df.head()


Unnamed: 0,matchId,batsman,runs_scored,balls_faced,fours,sixes,team,opponent,venue,season,strike_rate
0,335982,AA Noffke,9,12,1,0,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,2007/08,75.0
1,335982,B Akhil,0,2,0,0,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,2007/08,0.0
2,335982,BB McCullum,158,77,10,13,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,2007/08,205.194805
3,335982,CL White,6,10,0,0,Royal Challengers Bangalore,Kolkata Knight Riders,M Chinnaswamy Stadium,2007/08,60.0
4,335982,DJ Hussey,12,12,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,M Chinnaswamy Stadium,2007/08,100.0


### 8 – Sort for Time-Series Operations

In [30]:
# Order each batsman's rows by matchId and renumber the index 0..n-1 so the
# rolling / expanding features below are computed in this order.
# NOTE(review): matchId is used as the chronological key; presumably IDs increase
# with match date -- confirm, or sort on a date column instead.
player_match_df = player_match_df.sort_values(
    by=["batsman", "matchId"],
    ignore_index=True,
)

player_match_df.head()


Unnamed: 0,matchId,batsman,runs_scored,balls_faced,fours,sixes,team,opponent,venue,season,strike_rate
0,548346,A Ashish Reddy,10,10,0,1,Deccan Chargers,Mumbai Indians,Wankhede Stadium,2012,100.0
1,548352,A Ashish Reddy,3,3,0,0,Deccan Chargers,Chennai Super Kings,"MA Chidambaram Stadium, Chepauk",2012,100.0
2,548359,A Ashish Reddy,8,8,1,0,Deccan Chargers,Kings XI Punjab,"Rajiv Gandhi International Stadium, Uppal",2012,100.0
3,548373,A Ashish Reddy,10,4,2,0,Deccan Chargers,Rajasthan Royals,"Rajiv Gandhi International Stadium, Uppal",2012,250.0
4,548376,A Ashish Reddy,4,5,0,0,Deccan Chargers,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",2012,80.0


### 9 – Player Form (Rolling Average – Last 5 Matches)

In [31]:
def _rolling_mean_last_5(values):
    """Mean over a window of up to 5 consecutive rows, ending at the current row."""
    return values.rolling(5, min_periods=1).mean()


# Form features per batsman: (source column, output column).
# The window ends at the current row, so each value includes the match it is
# stored on (a player's first row equals that match's own value).
# NOTE(review): if the model is meant to predict this match's runs, that is
# target leakage -- confirm how labels will be built.
for _source_col, _form_col in (
    ("runs_scored", "runs_last_5"),
    ("strike_rate", "sr_last_5"),
):
    player_match_df[_form_col] = (
        player_match_df
        .groupby("batsman")[_source_col]
        .transform(_rolling_mean_last_5)
    )

player_match_df.head(10)


Unnamed: 0,matchId,batsman,runs_scored,balls_faced,fours,sixes,team,opponent,venue,season,strike_rate,runs_last_5,sr_last_5
0,548346,A Ashish Reddy,10,10,0,1,Deccan Chargers,Mumbai Indians,Wankhede Stadium,2012,100.0,10.0,100.0
1,548352,A Ashish Reddy,3,3,0,0,Deccan Chargers,Chennai Super Kings,"MA Chidambaram Stadium, Chepauk",2012,100.0,6.5,100.0
2,548359,A Ashish Reddy,8,8,1,0,Deccan Chargers,Kings XI Punjab,"Rajiv Gandhi International Stadium, Uppal",2012,100.0,7.0,100.0
3,548373,A Ashish Reddy,10,4,2,0,Deccan Chargers,Rajasthan Royals,"Rajiv Gandhi International Stadium, Uppal",2012,250.0,7.75,137.5
4,548376,A Ashish Reddy,4,5,0,0,Deccan Chargers,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",2012,80.0,7.0,126.0
5,598000,A Ashish Reddy,7,4,1,0,Sunrisers Hyderabad,Pune Warriors,"Rajiv Gandhi International Stadium, Uppal",2013,175.0,6.4,141.0
6,598004,A Ashish Reddy,14,12,0,1,Sunrisers Hyderabad,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",2013,116.666667,8.6,144.333333
7,598010,A Ashish Reddy,16,9,2,0,Sunrisers Hyderabad,Delhi Daredevils,Feroz Shah Kotla,2013,177.777778,10.2,159.888889
8,598013,A Ashish Reddy,4,5,0,0,Sunrisers Hyderabad,Kolkata Knight Riders,Eden Gardens,2013,80.0,9.0,125.888889
9,598018,A Ashish Reddy,19,15,0,1,Sunrisers Hyderabad,Pune Warriors,Maharashtra Cricket Association Stadium,2013,126.666667,12.0,135.222222


### 10 – Data Validation

In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, as a quick sanity check on value ranges.
player_match_df.describe()


Unnamed: 0,matchId,runs_scored,balls_faced,fours,sixes,strike_rate,runs_last_5,sr_last_5
count,16515.0,16515.0,16515.0,16515.0,16515.0,16515.0,16515.0,16515.0
mean,907862.6,19.98571,15.798971,1.807448,0.790251,109.237847,20.118102,109.750191
std,369668.6,21.360857,13.934583,2.310542,1.369255,66.104126,12.768341,36.370763
min,335982.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,548331.0,4.0,5.0,0.0,0.0,66.666667,10.2,88.431713
50%,980971.0,13.0,11.0,1.0,0.0,107.692308,18.5,109.270624
75%,1254071.0,29.0,23.0,3.0,1.0,146.153846,28.0,130.808982
max,1426312.0,175.0,77.0,19.0,17.0,600.0,158.0,600.0


### 11 – Missing Value Check

In [33]:
# Number of missing values per column.
player_match_df.isna().sum()


matchId        0
batsman        0
runs_scored    0
balls_faced    0
fours          0
sixes          0
team           0
opponent       0
venue          0
season         0
strike_rate    0
runs_last_5    0
sr_last_5      0
dtype: int64

### 12 – Save Feature Dataset

In [34]:
# Write the player-match features built so far (aggregates + rolling form)
# to the features folder, without the DataFrame index.
output_path = os.path.join(FEATURE_DIR, "player_match_features.csv")
player_match_df.to_csv(output_path, index=False)

print("Feature engineering completed successfully")
print("Saved at:", output_path)


Feature engineering completed successfully
Saved at: c:\Users\DELL\OneDrive\Desktop\infosys\IPL_EDA_Project\notebooks\..\data\features\player_match_features.csv


## Summary

✔ Converted IPL ball-by-ball data to player-match level  
✔ Engineered rolling form features for each player  
✔ Dataset ready for advanced feature engineering and modeling

### 13 – Venue Average Runs (Player at Venue)

In [35]:
# Mean runs per (batsman, venue) pair, one row per pair.
# NOTE(review): the mean covers every match in player_match_df, so for any given
# row it also includes that match and later ones -- confirm this is acceptable
# before a chronological train/test split.
venue_avg_df = (
    player_match_df
    .groupby(["batsman", "venue"], as_index=False)
    .agg(venue_avg_runs=("runs_scored", "mean"))
)

venue_avg_df.head()


Unnamed: 0,batsman,venue,venue_avg_runs
0,A Ashish Reddy,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,14.0
1,A Ashish Reddy,Eden Gardens,4.0
2,A Ashish Reddy,Feroz Shah Kotla,16.0
3,A Ashish Reddy,M Chinnaswamy Stadium,17.5
4,A Ashish Reddy,"MA Chidambaram Stadium, Chepauk",19.5


### 14 – Merge Venue Averages into Main Dataset

In [36]:
# Attach each batsman's venue average to every player-match row.
# Any existing `venue_avg_runs` column is dropped first so the cell is idempotent:
# without this, re-running the cell makes merge() produce `venue_avg_runs_x` /
# `venue_avg_runs_y`, and later cells that expect `venue_avg_runs` fail.
# validate="many_to_one" asserts that venue_avg_df has one row per (batsman, venue),
# so the merge can never multiply player-match rows.
player_match_df = (
    player_match_df
    .drop(columns=["venue_avg_runs"], errors="ignore")
    .merge(
        venue_avg_df,
        on=["batsman", "venue"],
        how="left",
        validate="many_to_one",
    )
)

player_match_df.head()


Unnamed: 0,matchId,batsman,runs_scored,balls_faced,fours,sixes,team,opponent,venue,season,strike_rate,runs_last_5,sr_last_5,venue_avg_runs
0,548346,A Ashish Reddy,10,10,0,1,Deccan Chargers,Mumbai Indians,Wankhede Stadium,2012,100.0,10.0,100.0,10.0
1,548352,A Ashish Reddy,3,3,0,0,Deccan Chargers,Chennai Super Kings,"MA Chidambaram Stadium, Chepauk",2012,100.0,6.5,100.0,19.5
2,548359,A Ashish Reddy,8,8,1,0,Deccan Chargers,Kings XI Punjab,"Rajiv Gandhi International Stadium, Uppal",2012,100.0,7.0,100.0,8.454545
3,548373,A Ashish Reddy,10,4,2,0,Deccan Chargers,Rajasthan Royals,"Rajiv Gandhi International Stadium, Uppal",2012,250.0,7.75,137.5,8.454545
4,548376,A Ashish Reddy,4,5,0,0,Deccan Chargers,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",2012,80.0,7.0,126.0,8.454545


### 15 – Player vs Team (PvT) Stats

In [37]:
# Player-vs-team summary: for every (batsman, opponent) pair, the mean runs
# scored and the number of player-match rows against that opponent.
# NOTE(review): computed over all matches in player_match_df, including matches
# later than the row the value ends up on -- confirm this is intended.
pvt_aggregations = {
    "pvt_avg_runs": ("runs_scored", "mean"),
    "pvt_matches": ("matchId", "count"),
}

pvt_df = (
    player_match_df
    .groupby(["batsman", "opponent"], as_index=False)
    .agg(**pvt_aggregations)
)

pvt_df.head()


Unnamed: 0,batsman,opponent,pvt_avg_runs,pvt_matches
0,A Ashish Reddy,Chennai Super Kings,15.0,3
1,A Ashish Reddy,Delhi Daredevils,12.0,3
2,A Ashish Reddy,Kings XI Punjab,12.333333,3
3,A Ashish Reddy,Kolkata Knight Riders,8.5,2
4,A Ashish Reddy,Mumbai Indians,13.5,2


### 16 – Merge PvT Stats

In [38]:
# Attach the player-vs-team stats to every player-match row.
# Existing PvT columns are dropped first so the cell is idempotent: without this,
# re-running the cell yields `pvt_avg_runs_x` / `pvt_avg_runs_y` (and the same
# for pvt_matches) instead of refreshing the columns.
# validate="many_to_one" asserts one pvt_df row per (batsman, opponent).
player_match_df = (
    player_match_df
    .drop(columns=["pvt_avg_runs", "pvt_matches"], errors="ignore")
    .merge(
        pvt_df,
        on=["batsman", "opponent"],
        how="left",
        validate="many_to_one",
    )
)

player_match_df.head()


Unnamed: 0,matchId,batsman,runs_scored,balls_faced,fours,sixes,team,opponent,venue,season,strike_rate,runs_last_5,sr_last_5,venue_avg_runs,pvt_avg_runs,pvt_matches
0,548346,A Ashish Reddy,10,10,0,1,Deccan Chargers,Mumbai Indians,Wankhede Stadium,2012,100.0,10.0,100.0,10.0,13.5,2
1,548352,A Ashish Reddy,3,3,0,0,Deccan Chargers,Chennai Super Kings,"MA Chidambaram Stadium, Chepauk",2012,100.0,6.5,100.0,19.5,15.0,3
2,548359,A Ashish Reddy,8,8,1,0,Deccan Chargers,Kings XI Punjab,"Rajiv Gandhi International Stadium, Uppal",2012,100.0,7.0,100.0,8.454545,12.333333,3
3,548373,A Ashish Reddy,10,4,2,0,Deccan Chargers,Rajasthan Royals,"Rajiv Gandhi International Stadium, Uppal",2012,250.0,7.75,137.5,8.454545,12.333333,3
4,548376,A Ashish Reddy,4,5,0,0,Deccan Chargers,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",2012,80.0,7.0,126.0,8.454545,11.0,5


### 17 – Player vs Player (PvP) Raw Aggregation

In [39]:
# Batsman-vs-bowler totals over the whole dataset:
# runs scored off the bowler and number of delivery rows faced from them.
pvp_aggregations = {
    "pvp_runs": ("batsman_runs", "sum"),
    "pvp_balls": ("ball", "count"),
}

pvp_raw = (
    ipl_df
    .groupby(["batsman", "bowler"], as_index=False)
    .agg(**pvp_aggregations)
)

pvp_raw.head()


Unnamed: 0,batsman,bowler,pvp_runs,pvp_balls
0,A Ashish Reddy,A Nehra,7,9
1,A Ashish Reddy,AB Dinda,9,7
2,A Ashish Reddy,AD Mathews,25,12
3,A Ashish Reddy,AD Russell,4,3
4,A Ashish Reddy,Anureet Singh,2,2


### 18 – PvP Strike Rate Calculation

In [41]:
# Strike rate of each batsman against each bowler: runs per ball, times 100.
pvp_runs_per_ball = pvp_raw["pvp_runs"].div(pvp_raw["pvp_balls"])
pvp_raw["pvp_strike_rate"] = pvp_runs_per_ball.mul(100)

pvp_raw.head()


Unnamed: 0,batsman,bowler,pvp_runs,pvp_balls,pvp_strike_rate
0,A Ashish Reddy,A Nehra,7,9,77.777778
1,A Ashish Reddy,AB Dinda,9,7,128.571429
2,A Ashish Reddy,AD Mathews,25,12,208.333333
3,A Ashish Reddy,AD Russell,4,3,133.333333
4,A Ashish Reddy,Anureet Singh,2,2,100.0


### 19 – Merge PvP Stats into Player-Match Dataset

In [42]:
# Per-match PvP feature: for every (matchId, batsman), average the batsman's
# strike rate against the bowler of each delivery faced in that match. Averaging
# per delivery weights each bowler by the number of balls they bowled to the
# batsman in the match.
# Only the key columns and the strike rate are merged; joining the full
# ball-by-ball frame would carry every other column through the merge for nothing.
pvp_match_avg = (
    ipl_df[["matchId", "batsman", "bowler"]]
    .merge(
        pvp_raw[["batsman", "bowler", "pvp_strike_rate"]],
        on=["batsman", "bowler"],
        how="left",
    )
    .groupby(["matchId", "batsman"])
    .agg(avg_pvp_sr=("pvp_strike_rate", "mean"))
    .reset_index()
)

# Drop any existing `avg_pvp_sr` first so re-running the cell refreshes the
# column instead of producing `avg_pvp_sr_x` / `avg_pvp_sr_y`.
# validate="many_to_one" asserts one pvp_match_avg row per (matchId, batsman).
player_match_df = (
    player_match_df
    .drop(columns=["avg_pvp_sr"], errors="ignore")
    .merge(
        pvp_match_avg,
        on=["matchId", "batsman"],
        how="left",
        validate="many_to_one",
    )
)

player_match_df.head()


Unnamed: 0,matchId,batsman,runs_scored,balls_faced,fours,sixes,team,opponent,venue,season,strike_rate,runs_last_5,sr_last_5,venue_avg_runs,pvt_avg_runs,pvt_matches,avg_pvp_sr
0,548346,A Ashish Reddy,10,10,0,1,Deccan Chargers,Mumbai Indians,Wankhede Stadium,2012,100.0,10.0,100.0,10.0,13.5,2,100.0
1,548352,A Ashish Reddy,3,3,0,0,Deccan Chargers,Chennai Super Kings,"MA Chidambaram Stadium, Chepauk",2012,100.0,6.5,100.0,19.5,15.0,3,128.205128
2,548359,A Ashish Reddy,8,8,1,0,Deccan Chargers,Kings XI Punjab,"Rajiv Gandhi International Stadium, Uppal",2012,100.0,7.0,100.0,8.454545,12.333333,3,100.0
3,548373,A Ashish Reddy,10,4,2,0,Deccan Chargers,Rajasthan Royals,"Rajiv Gandhi International Stadium, Uppal",2012,250.0,7.75,137.5,8.454545,12.333333,3,250.0
4,548376,A Ashish Reddy,4,5,0,0,Deccan Chargers,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",2012,80.0,7.0,126.0,8.454545,11.0,5,127.272727


### 20 – Career Statistics (Till That Match)

In [43]:
def _expanding_mean(values):
    """Running mean of all rows from the first one up to and including the current row."""
    return values.expanding().mean()


# 1-based position of each row within its batsman's sequence of rows.
player_match_df["career_matches"] = (
    player_match_df.groupby("batsman").cumcount().add(1)
)

# Running career averages per batsman: (source column, output column).
# Each value includes the current row's match.
for _source_col, _career_col in (
    ("runs_scored", "career_avg_runs"),
    ("strike_rate", "career_avg_sr"),
):
    player_match_df[_career_col] = (
        player_match_df
        .groupby("batsman")[_source_col]
        .transform(_expanding_mean)
    )

player_match_df.head()


Unnamed: 0,matchId,batsman,runs_scored,balls_faced,fours,sixes,team,opponent,venue,season,strike_rate,runs_last_5,sr_last_5,venue_avg_runs,pvt_avg_runs,pvt_matches,avg_pvp_sr,career_matches,career_avg_runs,career_avg_sr
0,548346,A Ashish Reddy,10,10,0,1,Deccan Chargers,Mumbai Indians,Wankhede Stadium,2012,100.0,10.0,100.0,10.0,13.5,2,100.0,1,10.0,100.0
1,548352,A Ashish Reddy,3,3,0,0,Deccan Chargers,Chennai Super Kings,"MA Chidambaram Stadium, Chepauk",2012,100.0,6.5,100.0,19.5,15.0,3,128.205128,2,6.5,100.0
2,548359,A Ashish Reddy,8,8,1,0,Deccan Chargers,Kings XI Punjab,"Rajiv Gandhi International Stadium, Uppal",2012,100.0,7.0,100.0,8.454545,12.333333,3,100.0,3,7.0,100.0
3,548373,A Ashish Reddy,10,4,2,0,Deccan Chargers,Rajasthan Royals,"Rajiv Gandhi International Stadium, Uppal",2012,250.0,7.75,137.5,8.454545,12.333333,3,250.0,4,7.75,137.5
4,548376,A Ashish Reddy,4,5,0,0,Deccan Chargers,Royal Challengers Bangalore,"Rajiv Gandhi International Stadium, Uppal",2012,80.0,7.0,126.0,8.454545,11.0,5,127.272727,5,7.0,126.0


### 21 – Handle Missing Values (Important for ML)

In [44]:
# Engineered columns whose gaps are filled with that column's median.
feature_cols = [
    "venue_avg_runs",
    "pvt_avg_runs",
    "avg_pvp_sr",
    "career_avg_runs",
    "career_avg_sr",
]

# Medians are computed once, before filling, then applied column by column.
feature_medians = player_match_df[feature_cols].median()
player_match_df[feature_cols] = player_match_df[feature_cols].fillna(feature_medians)

# Remaining missing values per column.
player_match_df.isnull().sum()


matchId            0
batsman            0
runs_scored        0
balls_faced        0
fours              0
sixes              0
team               0
opponent           0
venue              0
season             0
strike_rate        0
runs_last_5        0
sr_last_5          0
venue_avg_runs     0
pvt_avg_runs       0
pvt_matches        0
avg_pvp_sr         0
career_matches     0
career_avg_runs    0
career_avg_sr      0
dtype: int64

### 22 – Save Final Feature-Engineered Dataset

In [45]:
# Write the complete feature table (basic + advanced columns) to the features
# folder, without the DataFrame index.
final_output_path = os.path.join(FEATURE_DIR, "player_match_features_full.csv")
player_match_df.to_csv(final_output_path, index=False)

print("Advanced feature engineering completed")
print("Saved at:", final_output_path)


Advanced feature engineering completed
Saved at: c:\Users\DELL\OneDrive\Desktop\infosys\IPL_EDA_Project\notebooks\..\data\features\player_match_features_full.csv


## Feature Engineering Summary

✔ Player form (rolling averages)  
✔ Venue-based averages  
✔ Player vs Team (PvT) statistics  
✔ Player vs Player (PvP) strike rate  
✔ Career-level statistics  

Dataset is now fully ready for:
- Label creation
- Time-series train-test split
- Model training


In [59]:
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
# Base directory = notebooks folder
BASE_DIR = os.getcwd()

# Feature engineering directory inside data/
FEATURE_BASE_DIR = os.path.join(
    BASE_DIR, "..", "data", "feature_engineering"
)

FEATURE_PLOTS_DIR = os.path.join(
    FEATURE_BASE_DIR, "plots"
)

# Create directories
os.makedirs(FEATURE_BASE_DIR, exist_ok=True)
os.makedirs(FEATURE_PLOTS_DIR, exist_ok=True)

# Alias used by plotting cells
PLOTS_DIR = FEATURE_PLOTS_DIR

print("Feature data path:", FEATURE_BASE_DIR)
print("Feature plots path:", FEATURE_PLOTS_DIR)


Feature data path: c:\Users\DELL\OneDrive\Desktop\infosys\IPL_EDA_Project\notebooks\..\data\feature_engineering
Feature plots path: c:\Users\DELL\OneDrive\Desktop\infosys\IPL_EDA_Project\notebooks\..\data\feature_engineering\plots


In [61]:
# Save basic feature set
player_match_df.to_csv(
    os.path.join(FEATURE_BASE_DIR, "player_match_features.csv"),
    index=False
)

print("Saved player_match_features.csv")


Saved player_match_features.csv


In [62]:
# Save the complete feature table (all engineered columns) next to the plots folder.
full_features_path = os.path.join(FEATURE_BASE_DIR, "player_match_features_full.csv")
player_match_df.to_csv(full_features_path, index=False)

print("Saved player_match_features_full.csv")


Saved player_match_features_full.csv


### Visualization 1: Runs Distribution at Player–Match Level

In [64]:
# Histogram of runs per player per match; the figure is written to disk and
# closed rather than displayed inline.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(player_match_df["runs_scored"], bins=30, color="red", ax=ax)
ax.set_title("Distribution of Runs per Player per Match")
ax.set_xlabel("Runs Scored")
ax.set_ylabel("Frequency")
fig.tight_layout()

fig.savefig(os.path.join(PLOTS_DIR, "runs_distribution_player_match.png"))
plt.close(fig)


### Visualization 2: Top Players by Average Runs

In [65]:
# Ten batsmen with the highest mean runs per match.
mean_runs_by_batsman = player_match_df.groupby("batsman")["runs_scored"].mean()
top_players = (
    mean_runs_by_batsman
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

# Horizontal bar chart, saved to disk and closed rather than displayed inline.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=top_players, x="runs_scored", y="batsman", color="red", ax=ax)
ax.set_title("Top 10 Players by Average Runs per Match")
ax.set_xlabel("Average Runs")
ax.set_ylabel("Player")
fig.tight_layout()

fig.savefig(os.path.join(PLOTS_DIR, "top_players_avg_runs.png"))
plt.close(fig)


In [66]:
# NOTE(review): this cell repeats the "Top Players by Average Runs" cell directly
# above it: it recomputes the same `top_players` table and overwrites the same
# PNG file. It looks like an accidental copy and is a candidate for deletion.
top_players = (
    player_match_df
    .groupby("batsman")["runs_scored"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

top_players_fig, top_players_ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=top_players,
    x="runs_scored",
    y="batsman",
    color="red",
    ax=top_players_ax,
)
top_players_ax.set(
    title="Top 10 Players by Average Runs per Match",
    xlabel="Average Runs",
    ylabel="Player",
)
top_players_fig.tight_layout()

top_players_fig.savefig(os.path.join(PLOTS_DIR, "top_players_avg_runs.png"))
plt.close(top_players_fig)


### Visualization 3: Venue Average Runs

In [67]:
# Use the batsman with the most player-match rows as the example player.
batsman_row_counts = player_match_df["batsman"].value_counts()
player_name = batsman_row_counts.index[0]

# That player's eight venues with the highest mean runs.
player_rows = player_match_df[player_match_df["batsman"] == player_name]
venue_avg_plot = (
    player_rows
    .groupby("venue")["runs_scored"]
    .mean()
    .sort_values(ascending=False)
    .head(8)
    .reset_index()
)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=venue_avg_plot, x="runs_scored", y="venue", color="red", ax=ax)
ax.set_title(f"{player_name} – Average Runs by Venue")
ax.set_xlabel("Average Runs")
ax.set_ylabel("Venue")
fig.tight_layout()

fig.savefig(os.path.join(PLOTS_DIR, "venue_avg_runs.png"))
plt.close(fig)


### Visualization 4: Player vs Team (PvT) Performance

In [68]:
# Eight opponents against which the example player (`player_name`, chosen in the
# venue plot cell) has the highest mean runs.
player_rows = player_match_df[player_match_df["batsman"] == player_name]
pvt_plot = (
    player_rows
    .groupby("opponent")["runs_scored"]
    .mean()
    .sort_values(ascending=False)
    .head(8)
    .reset_index()
)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=pvt_plot, x="runs_scored", y="opponent", color="red", ax=ax)
ax.set_title(f"{player_name} – Average Runs vs Teams")
ax.set_xlabel("Average Runs")
ax.set_ylabel("Opponent Team")
fig.tight_layout()

fig.savefig(os.path.join(PLOTS_DIR, "player_vs_team_avg_runs.png"))
plt.close(fig)


### Visualization 5: Player Form Trend (Rolling Average)

In [69]:
# Form trend of the example player (`player_name`, chosen in an earlier plot cell).
# The index is reset after sorting so the x axis runs 0..n-1 over this player's
# matches. Without it the plotted Series keep their row labels from the full
# player_match_df, and the axis labelled "Match Index" shows arbitrary offsets.
player_form = (
    player_match_df[player_match_df["batsman"] == player_name]
    .sort_values("matchId")
    .reset_index(drop=True)
)

plt.figure(figsize=(12,5))
# Raw per-match runs, drawn lighter than the rolling average.
plt.plot(
    player_form["runs_scored"],
    label="Runs Scored",
    alpha=0.6
)
# Rolling mean from the `runs_last_5` feature column.
plt.plot(
    player_form["runs_last_5"],
    label="Rolling Avg (Last 5)",
    linewidth=2
)

plt.title(f"{player_name} – Player Form Across Matches")
plt.xlabel("Match Index")
plt.ylabel("Runs")
plt.legend()
plt.tight_layout()

# Save to the plots folder and close instead of displaying inline.
plt.savefig(os.path.join(PLOTS_DIR, "player_form_rolling_avg.png"))
plt.close()


### PvP: Top Bowlers Faced by a Player (by Balls)

- Shows which bowlers a batsman has faced the most

In [70]:
# Example player: the batsman with the most player-match rows.
batsman_row_counts = player_match_df["batsman"].value_counts()
player_name = batsman_row_counts.index[0]

# Ten bowlers who bowled the most delivery rows to that player.
player_deliveries = ipl_df[ipl_df["batsman"] == player_name]
pvp_balls_plot = (
    player_deliveries
    .groupby("bowler")["ball"]
    .count()
    .sort_values(ascending=False)
    .head(10)
    .reset_index(name="balls_faced")
)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=pvp_balls_plot, x="balls_faced", y="bowler", color="red", ax=ax)
ax.set_title(f"{player_name} – Most Faced Bowlers")
ax.set_xlabel("Balls Faced")
ax.set_ylabel("Bowler")
fig.tight_layout()

fig.savefig(os.path.join(PLOTS_DIR, "pvp_most_faced_bowlers.png"))
plt.close(fig)


### PvP: Strike Rate vs Top Bowlers

- Shows how effectively a batsman scores against specific bowlers

In [71]:
# Strike rate of the example player against the eight bowlers they faced most
# (ranked by pvp_balls).
player_pvp = pvp_raw[pvp_raw["batsman"] == player_name]
pvp_sr_plot = player_pvp.sort_values("pvp_balls", ascending=False).head(8)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=pvp_sr_plot, x="pvp_strike_rate", y="bowler", color="red", ax=ax)
ax.set_title(f"{player_name} – Strike Rate vs Bowlers")
ax.set_xlabel("Strike Rate")
ax.set_ylabel("Bowler")
fig.tight_layout()

fig.savefig(os.path.join(PLOTS_DIR, "pvp_strike_rate_vs_bowlers.png"))
plt.close(fig)


### PvP: Distribution of PvP Strike Rates

- Shows variability of player–bowler interactions

In [72]:
# Histogram of strike rates across all batsman-bowler pairs; saved to disk and
# closed rather than displayed inline.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(pvp_raw["pvp_strike_rate"], bins=30, color="red", ax=ax)
ax.set_title("Distribution of PvP Strike Rates")
ax.set_xlabel("PvP Strike Rate")
ax.set_ylabel("Frequency")
fig.tight_layout()

fig.savefig(os.path.join(PLOTS_DIR, "pvp_strike_rate_distribution.png"))
plt.close(fig)
