In [1]:
import pandas as pd
import numpy as np

# Load the ball-by-ball deliveries and the per-match metadata.
# Both paths are relative, so they resolve against the kernel's working directory.
deliveries_path = "Downloads/IPL_Project/deliveries.csv"
matches_path = "Downloads/IPL_Project/matches.csv"

balls = pd.read_csv(deliveries_path)
matches = pd.read_csv(matches_path)


In [2]:
# Attach match-level context (date, season, teams, venue) to every row of `balls`.
# A left join keeps each `balls` row even when `matches` has no row with the same id.
match_cols = ['id', 'match_date', 'season', 'team1', 'team2', 'venue']
df = pd.merge(balls, matches[match_cols], on='id', how='left')


In [3]:
# Parse match_date as day-first dates. Values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
parsed_dates = pd.to_datetime(df['match_date'], dayfirst=True, errors='coerce')
df['match_date'] = parsed_dates


In [4]:
# The bowling side is whichever of team1/team2 is not batting:
# team2 when team1 is the batting team, otherwise team1.
team1_is_batting = df['batting_team'] == df['team1']
df['bowling_team'] = np.where(team1_is_batting, df['team2'], df['team1'])


In [5]:
# Player-vs-team feature: mean of batsman_run over all rows of df for each
# (batter, bowling_team) pair, stored as 'pvt_avg'.
# NOTE(review): the mean is taken over every season in df, so it also covers
# matches that later end up in the 2023 test split - confirm this is intended.
pvt = (
    df.groupby(['batter', 'bowling_team'])['batsman_run']
    .mean()
    .rename('pvt_avg')
    .reset_index()
)


In [6]:
# Player-vs-player feature: mean of batsman_run over all rows of df for each
# (batter, bowler) pair, stored as 'pvp_avg'.
pvp = (
    df.groupby(['batter', 'bowler'])['batsman_run']
    .mean()
    .rename('pvp_avg')
    .reset_index()
)


In [7]:
# Build the match-level batting table: total batsman_run per batter per match,
# keeping season, date, venue and opponent as grouping keys. The sum is named 'runs'.
batter_match_keys = ['id', 'batter', 'season', 'match_date', 'venue', 'bowling_team']
batter_df = (
    df.groupby(batter_match_keys)['batsman_run']
    .sum()
    .rename('runs')
    .reset_index()
)


In [8]:
# Drop any pvt_avg / pvp_avg columns left over from a previous run of this cell,
# so re-running it does not create suffixed duplicates (pvt_avg_x / pvt_avg_y).
batter_df = batter_df.drop(columns=['pvt_avg', 'pvp_avg'], errors='ignore')

# Adding batter vs team average (pvt_avg) to the match-level batter stats.
# pvt has one row per (batter, bowling_team), so this is a many-to-one lookup;
# validate= raises if that assumption is ever violated.
batter_df = batter_df.merge(
    pvt, on=['batter', 'bowling_team'], how='left', validate='many_to_one'
)

# Adding batter vs bowler average (pvp_avg) to the match-level batter stats.
# pvp has one row per (batter, bowler). Merging it on 'batter' alone would repeat
# every match row once per bowler, which breaks the one-row-per-match layout the
# later rolling average and next-match shift rely on. Instead, average pvp_avg
# over the bowlers the batter actually faced in each match, giving a single
# value per (id, batter).
match_pvp = (
    df[['id', 'batter', 'bowler']]
    .drop_duplicates()
    .merge(pvp, on=['batter', 'bowler'], how='left')
    .groupby(['id', 'batter'])['pvp_avg']
    .mean()
    .reset_index()
)
batter_df = batter_df.merge(
    match_pvp, on=['id', 'batter'], how='left', validate='many_to_one'
)


In [9]:
# Venue feature: mean of batsman_run over all rows of df for each (batter, venue)
# pair, stored as 'venue_avg'.
venue_bat = (
    df.groupby(['batter', 'venue'])['batsman_run']
    .mean()
    .rename('venue_avg')
    .reset_index()
)

# Look up each match row's venue_avg by (batter, venue).
batter_df = pd.merge(batter_df, venue_bat, on=['batter', 'venue'], how='left')


In [10]:
# Career feature: mean of batsman_run over all rows of df for each batter,
# stored as 'career_avg'.
# NOTE(review): this uses every season in df, including the one later held out
# as the test split - confirm this is intended.
career_bat = (
    df.groupby('batter')['batsman_run']
    .mean()
    .rename('career_avg')
    .reset_index()
)

# Attach the per-batter value to every match row of that batter.
batter_df = pd.merge(batter_df, career_bat, on='batter', how='left')


In [11]:
# Order each batter's rows chronologically so the rolling window follows match order.
batter_df = batter_df.sort_values(['batter', 'match_date'])

# Mean of 'runs' over the current and up to four preceding rows of the same batter
# (min_periods=1, so the first rows use however many rows are available).
runs_by_batter = batter_df.groupby('batter')['runs']
recent_runs_mean = runs_by_batter.rolling(5, min_periods=1).mean()

# The grouped rolling result is indexed by (batter, original index); dropping the
# batter level lets the assignment align on batter_df's own index.
batter_df['rolling_avg_5'] = recent_runs_mean.droplevel(0)


In [12]:
# Prediction target: within each batter, the 'runs' value of the following row.
# The last row of every batter has no successor and gets NaN.
runs_per_batter = batter_df.groupby('batter')['runs']
batter_df['next_match_runs'] = runs_per_batter.shift(periods=-1)


In [13]:
# Inspect how pandas typed the 'season' column before converting it
batter_df.dtypes['season']


dtype('O')

In [14]:
# Reduce season to an integer year: stringify, keep the first four characters
# (so "2017/18" becomes "2017"), then cast to int.
season_year_text = batter_df['season'].astype(str).str.slice(0, 4)
batter_df['season'] = season_year_text.astype(int)


In [15]:
# Season-based split: everything before 2023 trains, the 2023 season tests.
# Rows with a season after 2023 would land in neither split.
bat_is_train = batter_df['season'] < 2023
bat_is_test = batter_df['season'] == 2023

train_bat = batter_df.loc[bat_is_train]
test_bat = batter_df.loc[bat_is_test]


In [16]:
# Columns of batter_df used as model inputs for batter performance
bat_features = ['rolling_avg_5', 'venue_avg', 'pvt_avg', 'pvp_avg', 'career_avg']


In [17]:
# Persist the match-level batter table (features + target) without the index column.
# The path is relative, so the file lands in the kernel's working directory.
batter_csv_path = "batter_dataset.csv"
batter_df.to_csv(batter_csv_path, index=False)


In [18]:
# Build the match-level bowling table: sum of iswicket_delivery per bowler per match,
# keeping season, date, venue and opponent as grouping keys. The sum is named 'wickets'.
bowler_match_keys = ['id', 'bowler', 'season', 'match_date', 'venue', 'batting_team']
bowler_df = (
    df.groupby(bowler_match_keys)['iswicket_delivery']
    .sum()
    .rename('wickets')
    .reset_index()
)


In [19]:
# Order each bowler's rows chronologically so the rolling window follows match order.
bowler_df = bowler_df.sort_values(['bowler', 'match_date'])

# Mean of 'wickets' over the current and up to four preceding rows of the same bowler
# (min_periods=1, so early rows use however many rows are available).
wickets_by_bowler = bowler_df.groupby('bowler')['wickets']
recent_wickets_mean = wickets_by_bowler.rolling(5, min_periods=1).mean()

# Drop the bowler level of the (bowler, original index) result so the assignment
# aligns on bowler_df's own index.
bowler_df['rolling_wkt_5'] = recent_wickets_mean.droplevel(0)


In [20]:
# Venue feature: mean of iswicket_delivery over all rows of df for each
# (bowler, venue) pair, stored as 'venue_avg'. Because df has one row per delivery,
# this is a per-delivery wicket rate rather than wickets per match.
venue_bowl = (
    df.groupby(['bowler', 'venue'])['iswicket_delivery']
    .mean()
    .rename('venue_avg')
    .reset_index()
)

# Look up each match row's venue_avg by (bowler, venue).
bowler_df = pd.merge(bowler_df, venue_bowl, on=['bowler', 'venue'], how='left')


In [21]:
# Career feature: sum of iswicket_delivery over all rows of df for each bowler,
# stored as 'career_wkts'.
career_bowl = (
    df.groupby('bowler')['iswicket_delivery']
    .sum()
    .rename('career_wkts')
    .reset_index()
)

# Attach the per-bowler total to every match row of that bowler.
bowler_df = pd.merge(bowler_df, career_bowl, on='bowler', how='left')


In [22]:
# Prediction target: within each bowler, the 'wickets' value of the following row,
# capped at 1 so the label is 0 (no wicket) or 1 (at least one wicket).
# The last row of every bowler has no successor and stays NaN.
following_wickets = bowler_df.groupby('bowler')['wickets'].shift(-1)
bowler_df['next_match_wickets'] = following_wickets.clip(upper=1)


In [23]:
# Persist the match-level bowler table (features + target) without the index column.
# The path is relative, so the file lands in the kernel's working directory.
bowler_csv_path = "bowler_dataset.csv"
bowler_df.to_csv(bowler_csv_path, index=False)


In [24]:
# Rebuilds the whole bowler dataset from df in a single cell.

# 1. MATCH-LEVEL WICKET AGGREGATION
# Sum of iswicket_delivery per bowler per match, with season, date, venue and
# opponent kept as keys; the sum is named 'wickets'.
bowl_keys = ['id', 'bowler', 'season', 'match_date', 'venue', 'batting_team']
bowler_df = (
    df.groupby(bowl_keys)['iswicket_delivery']
    .sum()
    .rename('wickets')
    .reset_index()
)

# 2. ROLLING FORM (LAST 5 MATCHES)
# Chronological order per bowler, then the mean of 'wickets' over the current and
# up to four preceding rows (min_periods=1).
bowler_df = bowler_df.sort_values(['bowler', 'match_date'])
form_last_5 = (
    bowler_df.groupby('bowler')['wickets']
    .rolling(5, min_periods=1)
    .mean()
)
# Drop the bowler level so the values align on bowler_df's own index.
bowler_df['rolling_wkt_5'] = form_last_5.droplevel(0)

# 3. VENUE AVERAGE PERFORMANCE
# Mean of iswicket_delivery per (bowler, venue) over the delivery rows of df,
# i.e. a per-delivery wicket rate.
venue_bowl = (
    df.groupby(['bowler', 'venue'])['iswicket_delivery']
    .mean()
    .rename('venue_avg')
    .reset_index()
)
bowler_df = pd.merge(bowler_df, venue_bowl, on=['bowler', 'venue'], how='left')

# 4. CAREER PERFORMANCE
# Sum of iswicket_delivery per bowler across all of df.
career_bowl = (
    df.groupby('bowler')['iswicket_delivery']
    .sum()
    .rename('career_wkts')
    .reset_index()
)
bowler_df = pd.merge(bowler_df, career_bowl, on='bowler', how='left')

# 5. LABEL (NEXT MATCH WICKET - BINARY CLASSIFICATION)
# The following row's 'wickets' for the same bowler, capped at 1.
upcoming_wickets = bowler_df.groupby('bowler')['wickets'].shift(-1)
bowler_df['next_match_wickets'] = upcoming_wickets.clip(upper=1)

# 6. EXPORT DATASET
bowler_df.to_csv("bowler_dataset.csv", index=False)
print("bowler_dataset.csv saved!")


bowler_dataset.csv saved!


In [25]:
# Preview the first five rows of the bowler dataset
bowler_df.head(n=5)


Unnamed: 0,id,bowler,season,match_date,venue,batting_team,wickets,rolling_wkt_5,venue_avg,career_wkts,next_match_wickets
0,548341,A Ashish Reddy,2012,2012-04-26,Subrata Roy Sahara Stadium,Pune Warriors,2,2.0,0.083333,19,1.0
1,548346,A Ashish Reddy,2012,2012-04-29,Wankhede Stadium,Mumbai Indians,1,1.5,0.071429,19,1.0
2,548348,A Ashish Reddy,2012,2012-05-01,Barabati Stadium,Pune Warriors,1,1.333333,0.052632,19,1.0
3,548352,A Ashish Reddy,2012,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,1,1.25,0.055556,19,1.0
4,548356,A Ashish Reddy,2012,2012-05-06,M Chinnaswamy Stadium,Royal Challengers Bangalore,1,1.2,0.041667,19,1.0


In [26]:
# Normalise season to an integer year in both datasets by keeping the first four
# characters of its string form (e.g. "2018/19" -> 2018).
batter_season_text = batter_df['season'].astype(str).str.slice(0, 4)
bowler_season_text = bowler_df['season'].astype(str).str.slice(0, 4)
batter_df['season'] = batter_season_text.astype(int)
bowler_df['season'] = bowler_season_text.astype(int)

# Batter split: seasons before 2023 train, season 2023 tests
train_bat = batter_df.loc[batter_df['season'] < 2023]
test_bat = batter_df.loc[batter_df['season'] == 2023]

# Bowler split: same season boundary
train_bowl = bowler_df.loc[bowler_df['season'] < 2023]
test_bowl = bowler_df.loc[bowler_df['season'] == 2023]


In [29]:
# Keep only the first four characters of season's string form and cast to int,
# for both the batter and the bowler dataset. Re-applying this to an already
# integer season leaves the values unchanged.
batter_year_text = batter_df['season'].astype(str).str.slice(0, 4)
bowler_year_text = bowler_df['season'].astype(str).str.slice(0, 4)
batter_df['season'] = batter_year_text.astype(int)
bowler_df['season'] = bowler_year_text.astype(int)


In [30]:
# Season-based hold-out: rows before 2023 form the training sets, rows from the
# 2023 season form the test sets.

# Batter dataset
bat_train_mask = batter_df['season'] < 2023
bat_test_mask = batter_df['season'] == 2023
train_bat = batter_df.loc[bat_train_mask]
test_bat = batter_df.loc[bat_test_mask]

# Bowler dataset
bowl_train_mask = bowler_df['season'] < 2023
bowl_test_mask = bowler_df['season'] == 2023
train_bowl = bowler_df.loc[bowl_train_mask]
test_bowl = bowler_df.loc[bowl_test_mask]


In [31]:
# Sanity check: list the distinct seasons that ended up in each split
labelled_splits = (
    ("Batter Train Seasons:", train_bat),
    ("Batter Test Seasons:", test_bat),
    ("Bowler Train Seasons:", train_bowl),
    ("Bowler Test Seasons:", test_bowl),
)
for split_label, split_frame in labelled_splits:
    print(split_label, sorted(split_frame['season'].unique()))


Batter Train Seasons: [np.int64(2007), np.int64(2009), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022)]
Batter Test Seasons: [np.int64(2023)]
Bowler Train Seasons: [np.int64(2007), np.int64(2009), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022)]
Bowler Test Seasons: [np.int64(2023)]


In [32]:
# Columns of the batter dataset fed to the batter model
bat_features = ['rolling_avg_5', 'venue_avg', 'pvt_avg', 'pvp_avg', 'career_avg']


In [33]:
# Columns of the bowler dataset fed to the bowler wicket model
bowl_features = ['rolling_wkt_5', 'venue_avg', 'career_wkts']


In [34]:
# Importing scaling and pipeline utilities for ML preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# For saving trained ML models to disk
import joblib


In [35]:
# Preprocessing for the batter features: a single standard-scaling step wrapped
# in a Pipeline.
batter_steps = [('scaler', StandardScaler())]
pipeline_bat = Pipeline(batter_steps)

# Learn the scaling parameters from the training split only.
batter_train_features = train_bat[bat_features]
pipeline_bat.fit(batter_train_features)

# Persist the fitted pipeline so it can be loaded again later.
joblib.dump(pipeline_bat, "feature_pipeline_batter.pkl")
print("batter pipeline saved")


batter pipeline saved


In [36]:
# Preprocessing for the bowler features: a single standard-scaling step wrapped
# in a Pipeline.
bowler_steps = [('scaler', StandardScaler())]
pipeline_bowl = Pipeline(bowler_steps)

# Learn the scaling parameters from the training split only.
bowler_train_features = train_bowl[bowl_features]
pipeline_bowl.fit(bowler_train_features)

# Persist the fitted pipeline so it can be loaded again later.
joblib.dump(pipeline_bowl, "feature_pipeline_bowler.pkl")
print("bowler pipeline saved")


bowler pipeline saved


In [37]:
import os

# Show the kernel's working directory; the relative paths used for reading and
# writing files in this notebook resolve against it.
working_dir = os.getcwd()
print(working_dir)


C:\Users\ANNU TIWARI
