In [1]:
#  importing required libraries 
#import pickle  #to save trained model 
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler   #standardize features to improve its performance
from sklearn.pipeline import Pipeline

In [2]:
#ipl = pd.read_csv("data/cleaned/ipl_master.csv")
ipl = pd.read_csv(
    "data/cleaned/ipl_master.csv",
    low_memory=False
)

In [3]:
print(ipl)

        match_id  inning           batting_team                 bowling_team  \
0         335982       1  Kolkata Knight Riders  Royal Challengers Bangalore   
1         335982       1  Kolkata Knight Riders  Royal Challengers Bangalore   
2         335982       1  Kolkata Knight Riders  Royal Challengers Bangalore   
3         335982       1  Kolkata Knight Riders  Royal Challengers Bangalore   
4         335982       1  Kolkata Knight Riders  Royal Challengers Bangalore   
...          ...     ...                    ...                          ...   
260915   1426312       2  Kolkata Knight Riders          Sunrisers Hyderabad   
260916   1426312       2  Kolkata Knight Riders          Sunrisers Hyderabad   
260917   1426312       2  Kolkata Knight Riders          Sunrisers Hyderabad   
260918   1426312       2  Kolkata Knight Riders          Sunrisers Hyderabad   
260919   1426312       2  Kolkata Knight Riders          Sunrisers Hyderabad   

        over  ball       batter        

In [4]:
#____player_match_aggregation
df = ipl.groupby(['batter','match_id','venue','bowler','bowling_team','date','over','batting_team','total_runs']).agg({'batsman_runs':'sum','is_wicket':'sum'}).reset_index()

In [5]:
#To Sort data in chronological order
df = df.sort_values(['batter','date'])

In [6]:
print(df)

                batter  match_id             venue           bowler  \
0       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
1       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
2       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
3       A Ashish Reddy    548346  Wankhede Stadium         MM Patel   
4       A Ashish Reddy    548346  Wankhede Stadium      RJ Peterson   
...                ...       ...               ...              ...   
178508          Z Khan   1082635  Feroz Shah Kotla        HH Pandya   
178509          Z Khan   1082635  Feroz Shah Kotla  Harbhajan Singh   
178510          Z Khan   1082635  Feroz Shah Kotla        KV Sharma   
178511          Z Khan   1082635  Feroz Shah Kotla        KV Sharma   
178512          Z Khan   1082646  Feroz Shah Kotla           P Negi   

                       bowling_team        date  over      batting_team  \
0                    Mumbai Indians  2012-04-29    14   Deccan Chargers 

In [7]:
#______rolling averages (form)feature
df['rolling_avg_5']=df.groupby('batter')['batsman_runs'].rolling(5).mean().reset_index(0,drop=True)
print(df.head())                                                          

           batter  match_id             venue        bowler    bowling_team  \
0  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
1  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
2  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
3  A Ashish Reddy    548346  Wankhede Stadium      MM Patel  Mumbai Indians   
4  A Ashish Reddy    548346  Wankhede Stadium   RJ Peterson  Mumbai Indians   

         date  over     batting_team  total_runs  batsman_runs  is_wicket  \
0  2012-04-29    14  Deccan Chargers           0             0          0   
1  2012-04-29    14  Deccan Chargers           1             1          0   
2  2012-04-29    14  Deccan Chargers           2             2          0   
3  2012-04-29    15  Deccan Chargers           0             0          1   
4  2012-04-29    13  Deccan Chargers           0             0          0   

   rolling_avg_5  
0            NaN  
1            NaN  
2    

In [8]:
#______rolling averages (form)feature
df['rolling_avg_wickets']=(
    df.groupby('bowler')['is_wicket'].rolling(window=5, min_periods=1).mean().reset_index(0,drop=True)
)
print(df[['date','match_id','bowler','rolling_avg_wickets']])                                                        

              date  match_id           bowler  rolling_avg_wickets
0       2012-04-29    548346     JEC Franklin                  0.0
1       2012-04-29    548346     JEC Franklin                  0.0
2       2012-04-29    548346     JEC Franklin                  0.0
3       2012-04-29    548346         MM Patel                  1.0
4       2012-04-29    548346      RJ Peterson                  0.0
...            ...       ...              ...                  ...
178508  2017-05-06   1082635        HH Pandya                  0.0
178509  2017-05-06   1082635  Harbhajan Singh                  0.0
178510  2017-05-06   1082635        KV Sharma                  0.0
178511  2017-05-06   1082635        KV Sharma                  0.2
178512  2017-05-14   1082646           P Negi                  0.2

[178513 rows x 4 columns]


In [9]:
df['overs_bowled_last5']=df.groupby('bowler')['over'].rolling(5).count().reset_index(0, drop=True)/6
print(df['overs_bowled_last5'])

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
            ...   
178508    0.833333
178509    0.833333
178510    0.833333
178511    0.833333
178512    0.833333
Name: overs_bowled_last5, Length: 178513, dtype: float64


In [10]:
#____________venue averages
df['venue_avg']=df.groupby('batter')['batsman_runs'].transform('mean')
print(df)
                

                batter  match_id             venue           bowler  \
0       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
1       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
2       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
3       A Ashish Reddy    548346  Wankhede Stadium         MM Patel   
4       A Ashish Reddy    548346  Wankhede Stadium      RJ Peterson   
...                ...       ...               ...              ...   
178508          Z Khan   1082635  Feroz Shah Kotla        HH Pandya   
178509          Z Khan   1082635  Feroz Shah Kotla  Harbhajan Singh   
178510          Z Khan   1082635  Feroz Shah Kotla        KV Sharma   
178511          Z Khan   1082635  Feroz Shah Kotla        KV Sharma   
178512          Z Khan   1082646  Feroz Shah Kotla           P Negi   

                       bowling_team        date  over      batting_team  \
0                    Mumbai Indians  2012-04-29    14   Deccan Chargers 

In [11]:
df['venue_avg_wickets']=(
    df.groupby(['bowler','venue'])['is_wicket'].transform('mean')
)
print(df[['bowler','venue','venue_avg_wickets']])

                 bowler             venue  venue_avg_wickets
0          JEC Franklin  Wankhede Stadium           0.071429
1          JEC Franklin  Wankhede Stadium           0.071429
2          JEC Franklin  Wankhede Stadium           0.071429
3              MM Patel  Wankhede Stadium           0.089552
4           RJ Peterson  Wankhede Stadium           0.000000
...                 ...               ...                ...
178508        HH Pandya  Feroz Shah Kotla           0.066667
178509  Harbhajan Singh  Feroz Shah Kotla           0.068966
178510        KV Sharma  Feroz Shah Kotla           0.105263
178511        KV Sharma  Feroz Shah Kotla           0.105263
178512           P Negi  Feroz Shah Kotla           0.108696

[178513 rows x 3 columns]


In [12]:
#_____________opponent-specific stats(Player vs Team)
#df['pvt_avg'] = df.groupby(['batter','bowler'])['batsman_runs'].transform('mean')

#___shift(1) avoids data leakage____

df['pvt_avg'] = df.groupby(['batter','bowler'])['batsman_runs']\
                           .expanding().mean().shift(1)\
                           .reset_index(level=[0,1], drop=True)
print(df)

                batter  match_id             venue           bowler  \
0       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
1       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
2       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
3       A Ashish Reddy    548346  Wankhede Stadium         MM Patel   
4       A Ashish Reddy    548346  Wankhede Stadium      RJ Peterson   
...                ...       ...               ...              ...   
178508          Z Khan   1082635  Feroz Shah Kotla        HH Pandya   
178509          Z Khan   1082635  Feroz Shah Kotla  Harbhajan Singh   
178510          Z Khan   1082635  Feroz Shah Kotla        KV Sharma   
178511          Z Khan   1082635  Feroz Shah Kotla        KV Sharma   
178512          Z Khan   1082646  Feroz Shah Kotla           P Negi   

                       bowling_team        date  over      batting_team  \
0                    Mumbai Indians  2012-04-29    14   Deccan Chargers 

In [13]:
#_____________opponent-specific stats(Player vs player(bowler)
df['pvp_avg'] = df.groupby(['batter','bowler'])['batsman_runs']\
                           .expanding().mean().shift(1)\
                           .reset_index(level=[0,1], drop=True)
print(df.head())

           batter  match_id             venue        bowler    bowling_team  \
0  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
1  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
2  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
3  A Ashish Reddy    548346  Wankhede Stadium      MM Patel  Mumbai Indians   
4  A Ashish Reddy    548346  Wankhede Stadium   RJ Peterson  Mumbai Indians   

         date  over     batting_team  total_runs  batsman_runs  is_wicket  \
0  2012-04-29    14  Deccan Chargers           0             0          0   
1  2012-04-29    14  Deccan Chargers           1             1          0   
2  2012-04-29    14  Deccan Chargers           2             2          0   
3  2012-04-29    15  Deccan Chargers           0             0          1   
4  2012-04-29    13  Deccan Chargers           0             0          0   

   rolling_avg_5  rolling_avg_wickets  overs_bowled_last5  ven

In [14]:
df['opponent_wickets']=(df.groupby(['bowler','batting_team'])['is_wicket'].transform('mean'))
print(df)

                batter  match_id             venue           bowler  \
0       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
1       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
2       A Ashish Reddy    548346  Wankhede Stadium     JEC Franklin   
3       A Ashish Reddy    548346  Wankhede Stadium         MM Patel   
4       A Ashish Reddy    548346  Wankhede Stadium      RJ Peterson   
...                ...       ...               ...              ...   
178508          Z Khan   1082635  Feroz Shah Kotla        HH Pandya   
178509          Z Khan   1082635  Feroz Shah Kotla  Harbhajan Singh   
178510          Z Khan   1082635  Feroz Shah Kotla        KV Sharma   
178511          Z Khan   1082635  Feroz Shah Kotla        KV Sharma   
178512          Z Khan   1082646  Feroz Shah Kotla           P Negi   

                       bowling_team        date  over      batting_team  \
0                    Mumbai Indians  2012-04-29    14   Deccan Chargers 

In [15]:
df['bowler_economy']=(
    df.groupby(['match_id','bowler'])['total_runs'].transform('sum')/
    (
        df.groupby(['match_id','bowler'])['over']
        .transform('nunique')
    )
)
print(df[['date','match_id','bowler','bowler_economy']])

              date  match_id           bowler  bowler_economy
0       2012-04-29    548346     JEC Franklin        5.333333
1       2012-04-29    548346     JEC Franklin        5.333333
2       2012-04-29    548346     JEC Franklin        5.333333
3       2012-04-29    548346         MM Patel        5.750000
4       2012-04-29    548346      RJ Peterson        8.000000
...            ...       ...              ...             ...
178508  2017-05-06   1082635        HH Pandya        2.000000
178509  2017-05-06   1082635  Harbhajan Singh        5.250000
178510  2017-05-06   1082635        KV Sharma        2.250000
178511  2017-05-06   1082635        KV Sharma        2.250000
178512  2017-05-14   1082646           P Negi        5.000000

[178513 rows x 4 columns]


In [16]:
#______________career stats
df['career_avg'] = df.groupby('batter')['batsman_runs']\
                           .expanding().mean().shift(1)\
                           .reset_index(level=0, drop=True)
print(df.head())


           batter  match_id             venue        bowler    bowling_team  \
0  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
1  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
2  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
3  A Ashish Reddy    548346  Wankhede Stadium      MM Patel  Mumbai Indians   
4  A Ashish Reddy    548346  Wankhede Stadium   RJ Peterson  Mumbai Indians   

         date  over     batting_team  total_runs  batsman_runs  ...  \
0  2012-04-29    14  Deccan Chargers           0             0  ...   
1  2012-04-29    14  Deccan Chargers           1             1  ...   
2  2012-04-29    14  Deccan Chargers           2             2  ...   
3  2012-04-29    15  Deccan Chargers           0             0  ...   
4  2012-04-29    13  Deccan Chargers           0             0  ...   

   rolling_avg_5  rolling_avg_wickets  overs_bowled_last5  venue_avg  \
0            NaN          

In [17]:
df['bowler_career_wickets_avg']=df.groupby('bowler')['is_wicket']\
                           .expanding().mean().shift(1)\
                           .reset_index(level=0, drop=True)
print(df.head(6))


           batter  match_id             venue        bowler    bowling_team  \
0  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
1  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
2  A Ashish Reddy    548346  Wankhede Stadium  JEC Franklin  Mumbai Indians   
3  A Ashish Reddy    548346  Wankhede Stadium      MM Patel  Mumbai Indians   
4  A Ashish Reddy    548346  Wankhede Stadium   RJ Peterson  Mumbai Indians   
5  A Ashish Reddy    548346  Wankhede Stadium   RJ Peterson  Mumbai Indians   

         date  over     batting_team  total_runs  batsman_runs  ...  \
0  2012-04-29    14  Deccan Chargers           0             0  ...   
1  2012-04-29    14  Deccan Chargers           1             1  ...   
2  2012-04-29    14  Deccan Chargers           2             2  ...   
3  2012-04-29    15  Deccan Chargers           0             0  ...   
4  2012-04-29    13  Deccan Chargers           0             0  ...   
5  2012-04-29    13 

In [18]:
#___________Create target label
df['next_match_runs']=df.groupby('batter')['batsman_runs'].shift(-1)
df=df.dropna()
print(df.head())

            batter  match_id                                      venue  \
30  A Ashish Reddy    598004  Rajiv Gandhi International Stadium, Uppal   
31  A Ashish Reddy    598004  Rajiv Gandhi International Stadium, Uppal   
84  A Ashish Reddy    598048                      M Chinnaswamy Stadium   
44  A Ashish Reddy    598018    Maharashtra Cricket Association Stadium   
49  A Ashish Reddy    598018    Maharashtra Cricket Association Stadium   

           bowler                 bowling_team        date  over  \
30  R Vinay Kumar  Royal Challengers Bangalore  2013-04-07    17   
31  R Vinay Kumar  Royal Challengers Bangalore  2013-04-07    19   
84  R Vinay Kumar  Royal Challengers Bangalore  2013-04-09    18   
44       AB Dinda                Pune Warriors  2013-04-17    19   
49     AD Mathews                Pune Warriors  2013-04-17    18   

           batting_team  total_runs  batsman_runs  ...  overs_bowled_last5  \
30  Sunrisers Hyderabad           6             6  ...        

In [19]:
df['next_match_wicket'] = df.groupby('bowler')['is_wicket'].shift(-1)
df=df.dropna()
print(df.head(5))

            batter  match_id                                      venue  \
30  A Ashish Reddy    598004  Rajiv Gandhi International Stadium, Uppal   
31  A Ashish Reddy    598004  Rajiv Gandhi International Stadium, Uppal   
84  A Ashish Reddy    598048                      M Chinnaswamy Stadium   
44  A Ashish Reddy    598018    Maharashtra Cricket Association Stadium   
49  A Ashish Reddy    598018    Maharashtra Cricket Association Stadium   

           bowler                 bowling_team        date  over  \
30  R Vinay Kumar  Royal Challengers Bangalore  2013-04-07    17   
31  R Vinay Kumar  Royal Challengers Bangalore  2013-04-07    19   
84  R Vinay Kumar  Royal Challengers Bangalore  2013-04-09    18   
44       AB Dinda                Pune Warriors  2013-04-17    19   
49     AD Mathews                Pune Warriors  2013-04-17    18   

           batting_team  total_runs  batsman_runs  ...  venue_avg  \
30  Sunrisers Hyderabad           6             6  ...   1.971831   
31

In [20]:
## _________________Feature selection for Batsman
features=df[['batter','rolling_avg_5','venue_avg','pvt_avg','pvp_avg','career_avg']]
labels  =df[['next_match_runs']]
print(features.head(10))
print(labels)

            batter  rolling_avg_5  venue_avg  pvt_avg  pvp_avg  career_avg
30  A Ashish Reddy            2.8   1.971831     1.50     1.50    1.666667
31  A Ashish Reddy            2.6   1.971831     2.40     2.40    1.806452
84  A Ashish Reddy            2.4   1.971831     2.00     2.00    1.750000
44  A Ashish Reddy            1.2   1.971831     2.00     2.00    1.723404
49  A Ashish Reddy            2.6   1.971831     1.75     1.75    1.711538
64  A Ashish Reddy            1.8   1.971831     1.00     1.00    1.761194
65  A Ashish Reddy            1.0   1.971831     0.80     0.80    1.735294
66  A Ashish Reddy            2.6   1.971831     1.00     1.00    1.739130
67  A Ashish Reddy            3.6   1.971831     2.00     2.00    1.828571
87  A Ashish Reddy            1.2   1.971831     2.50     2.50    1.839080
        next_match_runs
30                  0.0
31                  2.0
84                  1.0
44                  0.0
49                  3.0
...                 ...
178496 

In [21]:
print(df.columns.tolist())

['batter', 'match_id', 'venue', 'bowler', 'bowling_team', 'date', 'over', 'batting_team', 'total_runs', 'batsman_runs', 'is_wicket', 'rolling_avg_5', 'rolling_avg_wickets', 'overs_bowled_last5', 'venue_avg', 'venue_avg_wickets', 'pvt_avg', 'pvp_avg', 'opponent_wickets', 'bowler_economy', 'career_avg', 'bowler_career_wickets_avg', 'next_match_runs', 'next_match_wicket']


In [22]:
##_________________________feature selection for bowler
features = df[['bowler','rolling_avg_wickets','venue_avg_wickets','opponent_wickets','bowler_economy','bowler_career_wickets_avg']]
labels =df[['next_match_wicket']]
print('Bowler Features:')
print(features.head())
print(labels)

Bowler Features:
           bowler  rolling_avg_wickets  venue_avg_wickets  opponent_wickets  \
30  R Vinay Kumar                  0.2           0.114286          0.045977   
31  R Vinay Kumar                  0.2           0.114286          0.045977   
84  R Vinay Kumar                  0.2           0.087193          0.045977   
44       AB Dinda                  0.0           0.093333          0.094737   
49     AD Mathews                  0.0           0.035714          0.018868   

    bowler_economy  bowler_career_wickets_avg  
30            7.20                   0.250000  
31            7.20                   0.200000  
84            8.75                   0.333333  
44            5.25                   0.000000  
49            5.50                   0.000000  
        next_match_wicket
30                    1.0
31                    0.0
84                    0.0
44                    0.0
49                    0.0
...                   ...
178496                0.0
178500      

In [23]:
# sort chronologically for batsman
df = df.sort_values(['batter', 'date'])

# features and label
X = df[['rolling_avg_5', 'venue_avg', 'pvt_avg', 'pvp_avg', 'career_avg']]
y = df[['next_match_runs']]

# time-based split
split = int(len(df) * 0.8)

X_train = X.iloc[:split]
X_test  = X.iloc[split:]

y_train = y.iloc[:split]
y_test  = y.iloc[split:]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


(138274, 5) (34569, 5)
(138274, 1) (34569, 1)


In [24]:
# sort chronologically for bowler

# features and label
X = df[['rolling_avg_wickets','venue_avg_wickets','opponent_wickets','bowler_economy','bowler_career_wickets_avg']]
y = df[['next_match_wicket']]

# time-based split
split = int(len(df) * 0.8)

X_train = X.iloc[:split]
X_test  = X.iloc[split:]

y_train = y.iloc[:split]
y_test  = y.iloc[split:]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


(138274, 5) (34569, 5)
(138274, 1) (34569, 1)


In [25]:
#_________________Feature pipeline
pipeline=Pipeline([
    ('scaler',StandardScaler())
])
pipeline.fit(X_train)
joblib.dump(pipeline,"scripts/feature_pipeline.pkl")

['scripts/feature_pipeline.pkl']

In [26]:
#____________save Final Dataset
final_df = df[['batter','date','venue','bowler','bowling_team','rolling_avg_5','venue_avg','pvt_avg','pvp_avg','career_avg','next_match_runs','next_match_wicket','rolling_avg_wickets',
        'venue_avg_wickets','opponent_wickets','bowler_economy','bowler_career_wickets_avg']]
print(final_df.head())

            batter        date                                      venue  \
30  A Ashish Reddy  2013-04-07  Rajiv Gandhi International Stadium, Uppal   
31  A Ashish Reddy  2013-04-07  Rajiv Gandhi International Stadium, Uppal   
84  A Ashish Reddy  2013-04-09                      M Chinnaswamy Stadium   
44  A Ashish Reddy  2013-04-17    Maharashtra Cricket Association Stadium   
49  A Ashish Reddy  2013-04-17    Maharashtra Cricket Association Stadium   

           bowler                 bowling_team  rolling_avg_5  venue_avg  \
30  R Vinay Kumar  Royal Challengers Bangalore            2.8   1.971831   
31  R Vinay Kumar  Royal Challengers Bangalore            2.6   1.971831   
84  R Vinay Kumar  Royal Challengers Bangalore            2.4   1.971831   
44       AB Dinda                Pune Warriors            1.2   1.971831   
49     AD Mathews                Pune Warriors            2.6   1.971831   

    pvt_avg  pvp_avg  career_avg  next_match_runs  next_match_wicket  \
30     1

In [27]:
final_df.to_csv("data/processed/dataset.csv",index=False)
print("Milestone-2 is completed:dataset.csv & feature_pipeline.pkl created!")

Milestone-2 is completed:dataset.csv & feature_pipeline.pkl created!
