In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
ipl = pd.read_csv("../data/cleaned/ipl_cleaned.csv")

In [None]:
df = ipl.groupby(['match_id','batter','bowler','date','venue','bowling_team']).agg({
    'batsman_runs':'sum',
    'is_wicket':'sum'
}).reset_index()

df.head(5)

Unnamed: 0,match_id,batter,bowler,date,venue,bowling_team,batsman_runs,is_wicket
0,335982,AA Noffke,AB Agarkar,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,2,0
1,335982,AA Noffke,SC Ganguly,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,7,1
2,335982,B Akhil,AB Agarkar,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,0,1
3,335982,BB McCullum,AA Noffke,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,24,0
4,335982,BB McCullum,CL White,2008-04-18,M Chinnaswamy Stadium,Royal Challengers Bangalore,16,0


In [4]:
# Sort data by batter and date to ensure temporal ordering for all time-based features
df = df.sort_values(['batter','date']).reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,match_id,batter,bowler,date,venue,bowling_team,batsman_runs,is_wicket
0,548346,A Ashish Reddy,JEC Franklin,2012-04-29,Wankhede Stadium,Mumbai Indians,3,0
1,548346,A Ashish Reddy,MM Patel,2012-04-29,Wankhede Stadium,Mumbai Indians,0,1
2,548346,A Ashish Reddy,RJ Peterson,2012-04-29,Wankhede Stadium,Mumbai Indians,7,0
3,548352,A Ashish Reddy,BW Hilfenhaus,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,2,1
4,548352,A Ashish Reddy,DJ Bravo,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,1,0


##### Calculation of rolling average


In [None]:
ipl =  ipl.sort_values(['batter','match_id','over','ball'])

df['rolling_avg_5'] = (
    df.groupby('batter')['batsman_runs']
      .rolling(window=5, min_periods=1)
      .mean()
      .shift(1)
      .reset_index(level=0, drop=True)
)   

##### Calculation of venue average


In [7]:

df['venue_avg'] = (
    df.groupby(['batter', 'venue'])['batsman_runs']
      .expanding()
      .mean()
      .shift(1)
      .reset_index(level=[0,1], drop=True)
)

df['venue_avg'] = df['venue_avg'].fillna(0)

print(df.head(5))

   match_id          batter         bowler        date  \
0    548346  A Ashish Reddy   JEC Franklin  2012-04-29   
1    548346  A Ashish Reddy       MM Patel  2012-04-29   
2    548346  A Ashish Reddy    RJ Peterson  2012-04-29   
3    548352  A Ashish Reddy  BW Hilfenhaus  2012-05-04   
4    548352  A Ashish Reddy       DJ Bravo  2012-05-04   

                             venue         bowling_team  batsman_runs  \
0                 Wankhede Stadium       Mumbai Indians             3   
1                 Wankhede Stadium       Mumbai Indians             0   
2                 Wankhede Stadium       Mumbai Indians             7   
3  MA Chidambaram Stadium, Chepauk  Chennai Super Kings             2   
4  MA Chidambaram Stadium, Chepauk  Chennai Super Kings             1   

   is_wicket  rolling_avg_5  venue_avg  
0          0            NaN   4.666667  
1          1       3.000000   3.000000  
2          0       1.500000   1.500000  
3          1       3.333333   7.000000  
4      

##### Calculation of player-to-team average


In [8]:
df['pvt_avg'] = (
    df.groupby(['batter', 'bowling_team'])['batsman_runs']
      .expanding()
      .mean()
      .shift(1)
      .reset_index(level=[0,1], drop=True)
)

df['pvt_avg'] = df['pvt_avg'].fillna(0)
print(df.head(5))

   match_id          batter         bowler        date  \
0    548346  A Ashish Reddy   JEC Franklin  2012-04-29   
1    548346  A Ashish Reddy       MM Patel  2012-04-29   
2    548346  A Ashish Reddy    RJ Peterson  2012-04-29   
3    548352  A Ashish Reddy  BW Hilfenhaus  2012-05-04   
4    548352  A Ashish Reddy       DJ Bravo  2012-05-04   

                             venue         bowling_team  batsman_runs  \
0                 Wankhede Stadium       Mumbai Indians             3   
1                 Wankhede Stadium       Mumbai Indians             0   
2                 Wankhede Stadium       Mumbai Indians             7   
3  MA Chidambaram Stadium, Chepauk  Chennai Super Kings             2   
4  MA Chidambaram Stadium, Chepauk  Chennai Super Kings             1   

   is_wicket  rolling_avg_5  venue_avg  pvt_avg  
0          0            NaN   4.666667      3.4  
1          1       3.000000   3.000000      3.0  
2          0       1.500000   1.500000      1.5  
3          1

##### Calculation of career avg

In [9]:
df['career_avg'] = df.groupby('batter')['batsman_runs'] \
                     .expanding() \
                     .mean() \
                     .shift(1) \
                     .reset_index(level=0, drop=True)

# Handle NaN values from shift operation (first match for each player)
df['career_avg'] = df['career_avg'].fillna(0)

print(df.head(5))

   match_id          batter         bowler        date  \
0    548346  A Ashish Reddy   JEC Franklin  2012-04-29   
1    548346  A Ashish Reddy       MM Patel  2012-04-29   
2    548346  A Ashish Reddy    RJ Peterson  2012-04-29   
3    548352  A Ashish Reddy  BW Hilfenhaus  2012-05-04   
4    548352  A Ashish Reddy       DJ Bravo  2012-05-04   

                             venue         bowling_team  batsman_runs  \
0                 Wankhede Stadium       Mumbai Indians             3   
1                 Wankhede Stadium       Mumbai Indians             0   
2                 Wankhede Stadium       Mumbai Indians             7   
3  MA Chidambaram Stadium, Chepauk  Chennai Super Kings             2   
4  MA Chidambaram Stadium, Chepauk  Chennai Super Kings             1   

   is_wicket  rolling_avg_5  venue_avg  pvt_avg  career_avg  
0          0            NaN   4.666667      3.4    0.000000  
1          1       3.000000   3.000000      3.0    3.000000  
2          0       1.50000

In [10]:
# first compute at ball level
ipl['pvp_avg_ball'] = (
    ipl.groupby(['batter', 'bowler'])['batsman_runs']
       .expanding()
       .mean()
       .shift(1)
       .reset_index(level=[0,1], drop=True)
)

ipl['pvp_avg_ball'] = ipl['pvp_avg_ball'].fillna(0)

# convert to match level
pvp_match = (
    ipl.groupby(['batter', 'match_id'])['pvp_avg_ball']
       .mean()
       .reset_index()
)

# Remove column if it exists before merging
if 'pvp_avg' in df.columns:
    df = df.drop(columns=['pvp_avg'])
if 'pvp_avg_ball' in df.columns:
    df = df.drop(columns=['pvp_avg_ball'])


# merge into df
df = df.merge(pvp_match, on=['batter','match_id'], how='left')
df.rename(columns={'pvp_avg_ball':'pvp_avg'}, inplace=True)

print(df.head(5))

   match_id          batter         bowler        date  \
0    548346  A Ashish Reddy   JEC Franklin  2012-04-29   
1    548346  A Ashish Reddy       MM Patel  2012-04-29   
2    548346  A Ashish Reddy    RJ Peterson  2012-04-29   
3    548352  A Ashish Reddy  BW Hilfenhaus  2012-05-04   
4    548352  A Ashish Reddy       DJ Bravo  2012-05-04   

                             venue         bowling_team  batsman_runs  \
0                 Wankhede Stadium       Mumbai Indians             3   
1                 Wankhede Stadium       Mumbai Indians             0   
2                 Wankhede Stadium       Mumbai Indians             7   
3  MA Chidambaram Stadium, Chepauk  Chennai Super Kings             2   
4  MA Chidambaram Stadium, Chepauk  Chennai Super Kings             1   

   is_wicket  rolling_avg_5  venue_avg  pvt_avg  career_avg   pvp_avg  
0          0            NaN   4.666667      3.4    0.000000  0.685606  
1          1       3.000000   3.000000      3.0    3.000000  0.68560

##### Target value(**)


In [None]:
df['next_match_runs'] = df.groupby('batter')['batsman_runs'].shift(-1)
df = df.dropna()
df.head(10)

Unnamed: 0,match_id,batter,bowler,date,venue,bowling_team,batsman_runs,is_wicket,rolling_avg_5,venue_avg,pvt_avg,career_avg,pvp_avg,next_match_runs
1,548346,A Ashish Reddy,MM Patel,2012-04-29,Wankhede Stadium,Mumbai Indians,0,1,3.0,3.0,3.0,3.0,0.685606,7.0
2,548346,A Ashish Reddy,RJ Peterson,2012-04-29,Wankhede Stadium,Mumbai Indians,7,0,1.5,1.5,1.5,1.5,0.685606,2.0
3,548352,A Ashish Reddy,BW Hilfenhaus,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,2,1,3.333333,7.0,0.0,3.333333,1.733333,1.0
4,548352,A Ashish Reddy,DJ Bravo,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,1,0,3.0,2.0,2.0,3.0,1.733333,0.0
5,548359,A Ashish Reddy,P Awana,2012-05-08,"Rajiv Gandhi International Stadium, Uppal",Kings XI Punjab,0,1,2.6,7.333333,4.5,2.6,0.708333,6.0
6,548359,A Ashish Reddy,P Kumar,2012-05-08,"Rajiv Gandhi International Stadium, Uppal",Kings XI Punjab,6,0,2.0,0.0,0.0,2.166667,0.708333,2.0
7,548359,A Ashish Reddy,PP Chawla,2012-05-08,"Rajiv Gandhi International Stadium, Uppal",Kings XI Punjab,2,0,3.2,3.0,3.0,2.714286,0.708333,10.0
8,548373,A Ashish Reddy,SW Tait,2012-05-18,"Rajiv Gandhi International Stadium, Uppal",Rajasthan Royals,10,0,2.2,2.666667,5.2,2.625,1.5,4.0
9,548376,A Ashish Reddy,R Vinay Kumar,2012-05-20,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,4,1,3.8,4.5,5.285714,3.444444,1.4,6.0
10,598000,A Ashish Reddy,AB Dinda,2013-04-05,"Rajiv Gandhi International Stadium, Uppal",Pune Warriors,6,0,4.4,4.4,4.5,3.5,1.236111,1.0


In [12]:
df['next_match_wicket'] = df.groupby('bowler')['is_wicket'].shift(-1)
df = df.dropna()

##### Feature Selection for the ML model (**)


In [13]:
features = df[['rolling_avg_5', 'venue_avg', 'pvt_avg', 'pvp_avg', 'career_avg']]
labels   = df[['next_match_runs','next_match_wicket']]
print(df.head(2))

   match_id          batter       bowler        date             venue  \
1    548346  A Ashish Reddy     MM Patel  2012-04-29  Wankhede Stadium   
2    548346  A Ashish Reddy  RJ Peterson  2012-04-29  Wankhede Stadium   

     bowling_team  batsman_runs  is_wicket  rolling_avg_5  venue_avg  pvt_avg  \
1  Mumbai Indians             0          1            3.0        3.0      3.0   
2  Mumbai Indians             7          0            1.5        1.5      1.5   

   career_avg   pvp_avg  next_match_runs  next_match_wicket  
1         3.0  0.685606              7.0                0.0  
2         1.5  0.685606              2.0                1.0  


In [14]:
df = df.sort_values('match_id').reset_index(drop=True)

split = int(len(df)*0.8)
X_train, X_test = features[:split], features[split:]
y_train, y_test = labels[:split], labels[split:]

In [15]:
pipeline = Pipeline([
    ('scaler', StandardScaler())
])

pipeline.fit(X_train)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('scaler', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True


In [16]:
joblib.dump(pipeline, "../scripts/feature_pipeline.pkl")


['../scripts/feature_pipeline.pkl']

In [None]:
final_df = df[
    ['batter','date','venue','bowling_team','bowler',
     'rolling_avg_5','venue_avg','pvt_avg','pvp_avg','career_avg',
     'next_match_runs','next_match_wicket']
]

In [18]:
final_df.to_csv("../data/cleaned/dataset.csv", index=False)