In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
import timeit
import datetime as dt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier


In [2]:
df_match_summary = pd.read_csv('datasets/ipl-matches.csv')
df_match_summary.head()

Unnamed: 0,match_id,city,competition,gender,match_type,#_of_overs,player_of_match,umpires,venue,winner,...,neutral_venue,method,result,eliminator,outcome_by_runs,outcome_by_wickets,match_date,season,team1,team2
0,548334,Mumbai,IPL,male,T20,20,['SE Marsh'],"['S Ravi', 'RJ Tucker']",Wankhede Stadium,Kings XI Punjab,...,,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab
1,548327,Bangalore,IPL,male,T20,20,['CH Gayle'],"['S Asnani', 'S Das']",M Chinnaswamy Stadium,Royal Challengers Bangalore,...,,,,,,6.0,2012-04-17,2012,Royal Challengers Bangalore,Pune Warriors
2,1254082,Ahmedabad,IPL,male,T20,20,['PP Shaw'],"['AK Chaudhary', 'YC Barde']","Narendra Modi Stadium, Ahmedabad",Delhi Capitals,...,,,,,,7.0,2021-04-29,2021,Kolkata Knight Riders,Delhi Capitals
3,598052,Chandigarh,IPL,male,T20,20,['KK Cooper'],"['HDPK Dharmasena', 'S Ravi']","Punjab Cricket Association Stadium, Mohali",Rajasthan Royals,...,,,,,,8.0,2013-05-09,2013,Kings XI Punjab,Rajasthan Royals
4,392236,Centurion,IPL,male,T20,20,['MK Pandey'],"['IL Howell', 'S Ravi']",SuperSport Park,Royal Challengers Bangalore,...,1.0,,,,12.0,,2009-05-21,2009,Royal Challengers Bangalore,Deccan Chargers


In [3]:
# There is no need to load the match_summary fields onto details these can always be linked by the match_id
# Use usecols to read only the fields needed for processing

df_details = pd.read_csv('datasets/details.csv',\
                         usecols=lambda column : column not in ['Unnamed: 0' ,'city','venue','competition',\
                                                                'decision','eliminator','gender','match_type',\
                                                               'method','neutral_venue','#_of_overs','player_of_match',\
                                                               'result','team1','team2','umpires','winner','winner_toss',\
                                                                'outcome_by_runs','outcome_by_wickets',\
                                                               'match_date','season'])
df_details.head()

Unnamed: 0,inning,delivery,batsman,bowler,non_striker,batsman_runs,extras_runs,total,fielders,kind,player_out,legbyes,wides,batting_team,match_id,byes,noballs,non_boundary,penalty,bowling_team
0,inning_1,0.1,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,Kings XI Punjab
1,inning_1,0.2,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,Kings XI Punjab
2,inning_1,0.3,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,Kings XI Punjab
3,inning_1,0.4,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,Kings XI Punjab
4,inning_1,0.5,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,Mumbai Indians,548334,,,,,Kings XI Punjab


In [4]:
df_match_summary['match_date'] = pd.to_datetime(df_match_summary['match_date'])
df_match_summary['season'] = df_match_summary['match_date'].dt.year
df_match_summary.head(2)

Unnamed: 0,match_id,city,competition,gender,match_type,#_of_overs,player_of_match,umpires,venue,winner,...,neutral_venue,method,result,eliminator,outcome_by_runs,outcome_by_wickets,match_date,season,team1,team2
0,548334,Mumbai,IPL,male,T20,20,['SE Marsh'],"['S Ravi', 'RJ Tucker']",Wankhede Stadium,Kings XI Punjab,...,,,,,,6.0,2012-04-22,2012,Mumbai Indians,Kings XI Punjab
1,548327,Bangalore,IPL,male,T20,20,['CH Gayle'],"['S Asnani', 'S Das']",M Chinnaswamy Stadium,Royal Challengers Bangalore,...,,,,,,6.0,2012-04-17,2012,Royal Challengers Bangalore,Pune Warriors


In [5]:
df_match_summary['player_of_match'] = df_match_summary['player_of_match'].str[2:-2]
df_match_summary.rename(columns={'decision':'toss_decision'}, inplace=True)

In [6]:
df_match_summary.drop(['umpires'], axis=1, inplace=True)

In [7]:
team_names = {'Rajasthan Royals':'RR',\
              'Mumbai Indians' : 'MI',\
              'Delhi Daredevils':'DD',\
              'Kolkata Knight Riders':'KKR',\
              'Kings XI Punjab':'KXP',\
              'Sunrisers Hyderabad':'SRH',\
              'Deccan Chargers':'DC-Chargers',\
              'Chennai Super Kings':'CSK',\
              'Rising Pune Supergiants':'RPS',\
              'Royal Challengers Bangalore':'RCB',\
              'Pune Warriors':'PW',\
              'Gujarat Lions':'GL',\
              'Delhi Capitals':'DC-Capitals',\
              'Rising Pune Supergiant':'RPS',\
              'Kochi Tuskers Kerala':'KTK'
             }

In [8]:
df_match_summary['winner_toss'] = df_match_summary['winner_toss'].map(team_names)
df_match_summary['winner'] = df_match_summary['winner'].map(team_names)
df_match_summary['team1'] = df_match_summary['team1'].map(team_names)
df_match_summary['team2'] = df_match_summary['team2'].map(team_names)

In [9]:
df_match_summary.drop(['competition','gender','match_type','#_of_overs'], axis=1, inplace=True)

In [10]:
venue_names = {'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium':'YSR Visakhapatnam','M.Chinnaswamy Stadium':'Chinnaswamy, Bengaluru','M Chinnaswamy Stadium':'Chinnaswamy, Bengaluru',
               'Subrata Roy Sahara Stadium':'Sahara, Pune','Kingsmead':'Kingsmead, Durban','Wankhede Stadium':'Wankhede, Mumbai','Dr DY Patil Sports Academy':'DY Patil, Mumbai',
               'Brabourne Stadium':'Brabourne, Mumbai','Eden Gardens':'Eden Gardens, Kolkata','Sawai Mansingh Stadium':'Sawai Mansingh, Jaipur','Newlands':'Newlands, Capetown',
               'Himachal Pradesh Cricket Association Stadium':'HPCA, Dharamsala','Rajiv Gandhi International Stadium, Uppal':'Rajiv Gandhi, Hyderabad','JSCA International Stadium Complex':'JSCA, Ranchi',
               'Maharashtra Cricket Association Stadium':'MCAS, Pune','Feroz Shah Kotla':'Feroz Shah Kotla, Delhi','De Beers Diamond Oval':'De Beers, Kimberley',
               'MA Chidambaram Stadium, Chepauk':'MA Chidambaram Stadium, Chennai','Punjab Cricket Association IS Bindra Stadium, Mohali':'PCA, Mohali','Nehru Stadium':'Nehru, Kochi',
               'Green Park':'Green Park, Kanpur','Saurashtra Cricket Association Stadium':'SCA, Rajkot','Vidarbha Cricket Association Stadium, Jamtha':'VCA, Nagpur',
               'Punjab Cricket Association Stadium, Mohali':'PCA, Mohali','Shaheed Veer Narayan Singh International Stadium':'Shaheed Veer Narayan Singh, Raipur',
               'SuperSport Park':'Centurion, Gauteng','Holkar Cricket Stadium':'Holkar, Indore','Sardar Patel Stadium, Motera':'Sardar Patel, Ahmedabad','Barabati Stadium':'Barabati, Cuttack',
               'Sharjah Cricket Stadium':'Sharjah','Dubai International Cricket Stadium':'Dubai','Sheikh Zayed Stadium':'Abu Dhabi','OUTsurance Oval':'Bloemfontein',
               "St George's Park":'Port Elizabeth','Buffalo Park':'East London','New Wanderers Stadium':'Johannesburg'}

In [11]:
df_match_summary['venue'] = df_match_summary['venue'].map(venue_names)
df_match_summary.drop(['city'], axis=1, inplace=True)

In [12]:
df_details['batting_team'] = df_details['batting_team'].map(team_names)
df_details['bowling_team'] = df_details['bowling_team'].map(team_names)

In [13]:
df_details.head()

Unnamed: 0,inning,delivery,batsman,bowler,non_striker,batsman_runs,extras_runs,total,fielders,kind,player_out,legbyes,wides,batting_team,match_id,byes,noballs,non_boundary,penalty,bowling_team
0,inning_1,0.1,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,MI,548334,,,,,KXP
1,inning_1,0.2,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,MI,548334,,,,,KXP
2,inning_1,0.3,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,MI,548334,,,,,KXP
3,inning_1,0.4,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,MI,548334,,,,,KXP
4,inning_1,0.5,JEC Franklin,P Kumar,SR Tendulkar,0.0,0.0,0.0,,,,,,MI,548334,,,,,KXP


In [14]:
df_details.drop(['fielders'], axis=1, inplace=True)

In [15]:
df_details['delivery'] = df_details['delivery'].astype(str)
overs = {'0.1': '1','0.2': '1', '0.3': '1', '0.4': '1', '0.5': '1', '0.6': '1', '0.7': '1', '0.8': '1', '0.9': '1',\
         '1.1': '2','1.2': '2', '1.3': '2', '1.4': '2', '1.5': '2', '1.6': '2', '1.7': '2', '1.8': '2', '1.9': '2',\
         '2.1': '3','2.2': '3', '2.3': '3', '2.4': '3', '2.5': '3', '2.6': '3', '2.7': '3', '2.8': '3', '2.9': '3',\
         '3.1': '4','3.2': '4', '3.3': '4', '3.4': '4', '3.5': '4', '3.6': '4', '3.7': '4', '3.8': '4', '3.9': '4',\
         '4.1': '5','4.2': '5', '4.3': '5', '4.4': '5', '4.5': '5', '4.6': '5', '4.7': '5', '4.8': '5', '4.9': '5',\
         '5.1': '6','5.2': '6', '5.3': '6', '5.4': '6', '5.5': '6', '5.6': '6', '5.7': '6', '5.8': '6', '5.9': '6',\
         '6.1': '7','6.2': '7', '6.3': '7', '6.4': '7', '6.5': '7', '6.6': '7', '6.7': '7', '6.8': '7', '6.9': '7',\
         '7.1': '8','7.2': '8', '7.3': '8', '7.4': '8', '7.5': '8', '7.6': '8', '7.7': '8', '7.8': '8', '7.9': '8',\
         '8.1': '9','8.2': '9', '8.3': '9', '8.4': '9', '8.5': '9', '8.6': '9', '8.7': '9', '8.8': '9', '8.9': '9',\
         '9.1': '10','9.2': '10', '9.3': '10', '9.4': '10', '9.5': '10', '9.6': '10', '9.7': '10', '9.8': '10', '9.9': '10',\
         '10.1': '11','10.2': '11', '10.3': '11', '10.4': '11', '10.5': '11', '10.6': '11', '10.7': '11', '10.8': '11', '10.9': '11',\
         '11.1': '12','11.2': '12', '11.3': '12', '11.4': '12', '11.5': '12', '11.6': '12', '11.7': '12', '11.8': '12', '11.9': '12',\
         '12.1': '13','12.2': '13', '12.3': '13', '12.4': '13', '12.5': '13', '12.6': '13', '12.7': '13', '12.8': '13', '12.9': '13',\
         '13.1': '14','13.2': '14', '13.3': '14', '13.4': '14', '13.5': '14', '13.6': '14', '13.7': '14', '13.8': '14', '13.9': '14',\
         '14.1': '15','14.2': '15', '14.3': '15', '14.4': '15', '14.5': '15', '14.6': '15', '14.7': '15', '14.8': '15', '14.9': '15',\
         '15.1': '16','15.2': '16', '15.3': '16', '15.4': '16', '15.5': '16', '15.6': '16', '15.7': '16', '15.8': '16', '15.9': '16',\
         '16.1': '17','16.2': '17', '16.3': '17', '16.4': '17', '16.5': '17', '16.6': '17', '16.7': '17', '16.8': '17', '16.9': '17',\
         '17.1': '18','17.2': '18', '17.3': '18', '17.4': '18', '17.5': '18', '17.6': '18', '17.7': '18', '17.8': '18', '17.9': '18',\
         '18.1': '19','18.2': '19', '18.3': '19', '18.4': '19', '18.5': '19', '18.6': '19', '18.7': '19', '18.8': '19', '18.9': '19',\
         '19.1': '20','19.2': '20', '19.3': '20', '19.4': '20', '19.5': '20', '19.6': '20', '19.7': '20', '19.8': '20', '19.9': '20',\
        }
df_details['over'] = df_details['delivery'].map(overs)

In [16]:
delivery_map = {'0.1': '1','0.2': '2', '0.3': '3', '0.4': '4', '0.5': '5', '0.6': '6', '0.7': '7', '0.8': '8', '0.9': '9',\
         '1.1': '1','1.2': '2', '1.3': '3', '1.4': '4', '1.5': '5', '1.6': '6', '1.7': '7', '1.8': '8', '1.9': '9',\
         '2.1': '1','2.2': '2', '2.3': '3', '2.4': '4', '2.5': '5', '2.6': '6', '2.7': '7', '2.8': '8', '2.9': '9',\
         '3.1': '1','3.2': '2', '3.3': '3', '3.4': '4', '3.5': '5', '3.6': '6', '3.7': '7', '3.8': '8', '3.9': '9',\
         '4.1': '1','4.2': '2', '4.3': '3', '4.4': '4', '4.5': '5', '4.6': '6', '4.7': '7', '4.8': '8', '4.9': '9',\
         '5.1': '1','5.2': '2', '5.3': '3', '5.4': '4', '5.5': '5', '5.6': '6', '5.7': '7', '5.8': '8', '5.9': '9',\
         '6.1': '1','6.2': '2', '6.3': '3', '6.4': '4', '6.5': '5', '6.6': '6', '6.7': '7', '6.8': '8', '6.9': '9',\
         '7.1': '1','7.2': '2', '7.3': '3', '7.4': '4', '7.5': '5', '7.6': '6', '7.7': '7', '7.8': '8', '7.9': '9',\
         '8.1': '1','8.2': '2', '8.3': '3', '8.4': '4', '8.5': '5', '8.6': '6', '8.7': '7', '8.8': '8', '8.9': '9',\
         '9.1': '1','9.2': '2', '9.3': '3', '9.4': '4', '9.5': '5', '9.6': '6', '9.7': '7', '9.8': '8', '9.9': '9',\
         '10.1': '1','10.2': '2', '10.3': '3', '10.4': '4', '10.5': '5', '10.6': '6', '10.7': '7', '10.8': '8', '10.9': '9',\
         '11.1': '1','11.2': '2', '11.3': '3', '11.4': '4', '11.5': '5', '11.6': '6', '11.7': '7', '11.8': '8', '11.9': '9',\
         '12.1': '1','12.2': '2', '12.3': '3', '12.4': '4', '12.5': '5', '12.6': '6', '12.7': '7', '12.8': '8', '12.9': '9',\
         '13.1': '1','13.2': '2', '13.3': '3', '13.4': '4', '13.5': '5', '13.6': '6', '13.7': '7', '13.8': '8', '13.9': '9',\
         '14.1': '1','14.2': '2', '14.3': '3', '14.4': '4', '14.5': '5', '14.6': '6', '14.7': '7', '14.8': '8', '14.9': '9',\
         '15.1': '1','15.2': '2', '15.3': '3', '15.4': '4', '15.5': '5', '15.6': '6', '15.7': '7', '15.8': '8', '15.9': '9',\
         '16.1': '1','16.2': '2', '16.3': '3', '16.4': '4', '16.5': '5', '16.6': '6', '16.7': '7', '16.8': '8', '16.9': '9',\
         '17.1': '1','17.2': '2', '17.3': '3', '17.4': '4', '17.5': '5', '17.6': '6', '17.7': '7', '17.8': '8', '17.9': '9',\
         '18.1': '1','18.2': '2', '18.3': '3', '18.4': '4', '18.5': '5', '18.6': '6', '18.7': '7', '18.8': '8', '18.9': '9',\
         '19.1': '1','19.2': '2', '19.3': '3', '19.4': '4', '19.5': '5', '19.6': '6', '19.7': '7', '19.8': '8', '19.9': '9',\
        }
df_details['delivery_in_over'] = df_details['delivery'].map(delivery_map)

In [17]:
df_details['over'] = df_details['over'].astype(str)
inning_quarter = {'1': 'Q1','2': 'Q1', '3': 'Q1', '4': 'Q1', '5': 'Q1', \
                '6': 'Q2', '7': 'Q2', '8': 'Q2', '9': 'Q2', '10': 'Q2',\
                '11': 'Q3','12': 'Q3', '13': 'Q3', '14': 'Q3', '15': 'Q3',\
                '16': 'Q4', '17': 'Q4', '18': 'Q4', '19': 'Q4', '20': 'Q4'\
                }
df_details['inning_quarter'] = df_details['over'].map(inning_quarter)

In [18]:
df_details.drop(['delivery'], axis=1, inplace=True)

In [19]:
df_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200247 entries, 0 to 200246
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   inning            200247 non-null  object 
 1   batsman           200236 non-null  object 
 2   bowler            200236 non-null  object 
 3   non_striker       200236 non-null  object 
 4   batsman_runs      200236 non-null  float64
 5   extras_runs       199595 non-null  float64
 6   total             200236 non-null  float64
 7   kind              9826 non-null    object 
 8   player_out        9826 non-null    object 
 9   legbyes           3192 non-null    float64
 10  wides             6066 non-null    float64
 11  batting_team      199275 non-null  object 
 12  match_id          200247 non-null  int64  
 13  byes              526 non-null     float64
 14  noballs           806 non-null     float64
 15  non_boundary      18 non-null      float64
 16  penalty           2 

In [20]:
df_match_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845 entries, 0 to 844
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   match_id            845 non-null    int64         
 1   player_of_match     841 non-null    object        
 2   venue               767 non-null    object        
 3   winner              842 non-null    object        
 4   toss_decision       845 non-null    object        
 5   winner_toss         825 non-null    object        
 6   neutral_venue       77 non-null     float64       
 7   method              19 non-null     object        
 8   result              18 non-null     object        
 9   eliminator          14 non-null     object        
 10  outcome_by_runs     376 non-null    float64       
 11  outcome_by_wickets  451 non-null    float64       
 12  match_date          845 non-null    datetime64[ns]
 13  season              845 non-null    int64         

In [21]:
df_match = df_match_summary[['match_id', 'venue', 'match_date', 'season', 'toss_decision', 'winner_toss']]
toss = {'field':1, 'bat':2}
df_match['toss_decision'] = df_match['toss_decision'].map(toss)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_match['toss_decision'] = df_match['toss_decision'].map(toss)


In [22]:
train = df_match.sample(frac=0.8,random_state=200)

In [23]:
test = df_match.drop(train.index)

In [24]:
train.head()

Unnamed: 0,match_id,venue,match_date,season,toss_decision,winner_toss
64,392226,Bloemfontein,2009-05-15,2009,1,KXP
53,1178427,,2019-05-03,2019,1,KKR
39,1216526,Abu Dhabi,2020-10-16,2020,2,KKR
88,1136592,,2018-05-02,2018,1,RR
313,598004,"Rajiv Gandhi, Hyderabad",2013-04-07,2013,2,


In [25]:
test.head()

Unnamed: 0,match_id,venue,match_date,season,toss_decision,winner_toss
0,548334,"Wankhede, Mumbai",2012-04-22,2012,2,MI
2,1254082,,2021-04-29,2021,1,DC-Capitals
6,1254061,,2021-04-12,2021,1,RR
7,392183,"Newlands, Capetown",2009-04-19,2009,1,DD
17,1175372,"Chinnaswamy, Bengaluru",2019-04-05,2019,1,KKR


In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 676 entries, 64 to 387
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   match_id       676 non-null    int64         
 1   venue          615 non-null    object        
 2   match_date     676 non-null    datetime64[ns]
 3   season         676 non-null    int64         
 4   toss_decision  676 non-null    int64         
 5   winner_toss    659 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 37.0+ KB


In [27]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 842
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   match_id       169 non-null    int64         
 1   venue          152 non-null    object        
 2   match_date     169 non-null    datetime64[ns]
 3   season         169 non-null    int64         
 4   toss_decision  169 non-null    int64         
 5   winner_toss    166 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 9.2+ KB


In [28]:
df_details.drop(['extras_runs'], axis=1, inplace=True)
df_details.drop(['batsman_runs'], axis=1, inplace=True)
df_details.drop(['non_boundary'], axis=1, inplace=True)
df_details.drop(['penalty'], axis=1, inplace=True)

In [29]:
inn = {'inning_1':1, 'inning_2':2}
df_details['inning'] = df_details['inning'].map(inn)
df_details.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200247 entries, 0 to 200246
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   inning            200247 non-null  int64  
 1   batsman           200236 non-null  object 
 2   bowler            200236 non-null  object 
 3   non_striker       200236 non-null  object 
 4   total             200236 non-null  float64
 5   kind              9826 non-null    object 
 6   player_out        9826 non-null    object 
 7   legbyes           3192 non-null    float64
 8   wides             6066 non-null    float64
 9   batting_team      199275 non-null  object 
 10  match_id          200247 non-null  int64  
 11  byes              526 non-null     float64
 12  noballs           806 non-null     float64
 13  bowling_team      199339 non-null  object 
 14  over              200247 non-null  object 
 15  delivery_in_over  200247 non-null  object 
 16  inning_quarter    20

In [44]:
df_train = pd.merge(train, df_details, how="inner", on="match_id")
df_test =  pd.merge(test, df_details, how="inner", on="match_id")

In [45]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160173 entries, 0 to 160172
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   match_id          160173 non-null  int64         
 1   venue             145512 non-null  object        
 2   match_date        160173 non-null  datetime64[ns]
 3   season            160173 non-null  int64         
 4   toss_decision     160173 non-null  int64         
 5   winner_toss       156664 non-null  object        
 6   inning            160173 non-null  int64         
 7   batsman           160165 non-null  object        
 8   bowler            160165 non-null  object        
 9   non_striker       160165 non-null  object        
 10  total             160165 non-null  float64       
 11  kind              7838 non-null    object        
 12  player_out        7838 non-null    object        
 13  legbyes           2529 non-null    float64       
 14  wide

In [46]:
trainY = df_train['total']
trainX = df_train.loc[:, df_train.columns != 'total']
trainY.head()
trainX.head()

Unnamed: 0,match_id,venue,match_date,season,toss_decision,winner_toss,inning,batsman,bowler,non_striker,...,player_out,legbyes,wides,batting_team,byes,noballs,bowling_team,over,delivery_in_over,inning_quarter
0,392226,Bloemfontein,2009-05-15,2009,1,KXP,1,G Gambhir,B Lee,V Sehwag,...,,,,DD,,,KXP,1,1,Q1
1,392226,Bloemfontein,2009-05-15,2009,1,KXP,1,G Gambhir,B Lee,V Sehwag,...,,,,DD,,,KXP,1,2,Q1
2,392226,Bloemfontein,2009-05-15,2009,1,KXP,1,G Gambhir,B Lee,V Sehwag,...,,,,DD,,,KXP,1,3,Q1
3,392226,Bloemfontein,2009-05-15,2009,1,KXP,1,V Sehwag,B Lee,G Gambhir,...,,,,DD,,,KXP,1,4,Q1
4,392226,Bloemfontein,2009-05-15,2009,1,KXP,1,V Sehwag,B Lee,G Gambhir,...,,1.0,,DD,,,KXP,1,5,Q1


In [47]:
trainY.head()

0    0.0
1    0.0
2    1.0
3    4.0
4    1.0
Name: total, dtype: float64

In [48]:
testY = df_test['total']
testX = df_test.loc[:, df_test.columns != 'total']
testX.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40074 entries, 0 to 40073
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   match_id          40074 non-null  int64         
 1   venue             35939 non-null  object        
 2   match_date        40074 non-null  datetime64[ns]
 3   season            40074 non-null  int64         
 4   toss_decision     40074 non-null  int64         
 5   winner_toss       39515 non-null  object        
 6   inning            40074 non-null  int64         
 7   batsman           40071 non-null  object        
 8   bowler            40071 non-null  object        
 9   non_striker       40071 non-null  object        
 10  kind              1988 non-null   object        
 11  player_out        1988 non-null   object        
 12  legbyes           663 non-null    float64       
 13  wides             1208 non-null   float64       
 14  batting_team      3970

In [49]:
def calc_metrics(model_object, X_test, y_test, start_timer, stop_timer):
  '''
  This function evaluates the performance of the model given the model, test values and training time
  INPUT: Model, X_test, y_test, start_time, end_time
  OUTPUT: A dictionary containing the key metrics such as accuracy, precision, recall, confusion_matrix, f1, and training time 
  '''
  y_pred = model_object.predict(X_test)
  
  report_accuracy = model_object.best_score_
  report_test_accuracy = accuracy_score(y_test, y_pred)
  report_conf_matrix = confusion_matrix(y_test, y_pred)
  report_precision_rate = precision_score(y_test, y_pred)
  report_recall_rate = recall_score(y_test, y_pred)
  report_f1 = f1_score(y_test, y_pred)
  
  performance_metrics = {'training_accuracy':report_accuracy,\
                         'training_time': stop_timer - start_timer,\
                         'test_accuracy': report_test_accuracy,\
                         'confusion_matrix': report_conf_matrix,\
                         'precision': report_precision_rate,\
                         'recall': report_recall_rate,\
                         'f1': report_f1\
                         
                        }
  return performance_metrics


In [50]:
pca_components = 2
start = timeit.default_timer()


# Construct pipelines for each including an Imputer, Scaler, PCA and classifier
pipe_logreg = Pipeline([('imp', SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)),\
                    ('scl', StandardScaler()),\
                    ('pca', PCA(n_components=pca_components)),
                    ('clf', LogisticRegression(random_state=42,solver='lbfgs'))\
                   ])

pipe_svm = Pipeline([('imp', SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)),\
                     ('scl', StandardScaler()),\
                     ('pca', PCA(n_components=pca_components)),\
                     ('clf', SVC(random_state=42))\
                    ])
			
pipe_decisiontree = Pipeline([('imp', SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)),\
                     ('scl', StandardScaler()),\
                     ('pca', PCA(n_components=pca_components)),\
                     ('clf', DecisionTreeClassifier(random_state=42))\
                   ])

pipe_adaboost = Pipeline([('imp', SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)),\
                     ('scl', StandardScaler()),\
                     ('pca', PCA(n_components=pca_components)),\
                     ('clf', AdaBoostClassifier(random_state=42))\
                   ])

pipe_randomforest = Pipeline([('imp', SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)),\
                     ('scl', StandardScaler()),\
                     ('pca', PCA(n_components=pca_components)),\
                     ('clf', RandomForestClassifier(random_state=42))\
                   ])

pipe_gradboost = Pipeline([('imp', SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)),\
                     ('scl', StandardScaler()),\
                     ('pca', PCA(n_components=pca_components)),\
                     ('clf', GradientBoostingClassifier(random_state=42))\
                   ])

pipe_mlp = Pipeline([('imp', SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)),\
                     ('scl', StandardScaler()),\
                     ('pca', PCA(n_components=pca_components)),\
                     ('clf', MLPClassifier(random_state=42, activation='relu', solver='adam', max_iter=300))\
                   ])


# List of pipelines for ease of iteration
pipelines = [pipe_logreg, pipe_svm, pipe_decisiontree, pipe_adaboost, pipe_randomforest, pipe_gradboost, pipe_mlp]

# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Support Vector Machine', 2: 'Decision Tree', 3: 'AdaBoost', 4: 'RandomForest', 5: 'GradientBoost', 6: 'MultiLayerPerceptron'}

# Set grid search params
grid_params = [{'clf__C': np.logspace(-5, 8, 20)},\
               
               {'clf__kernel': ['linear', 'rbf', 'poly'],\
                'clf__C': np.linspace(1,10,10),\
                'clf__gamma':[1e-4, 1e-3, 1e-2, 1e-1]},\
               
               {'clf__criterion': ['gini', 'entropy'],\
                'clf__min_samples_leaf': [1, 2, 3, 4, 5],\
                'clf__max_depth': [1, 2, 3, 4, 5],\
                'clf__min_samples_split': [1, 2, 3, 4, 5][1:],\
                'clf__presort': [True, False]},\
               
               {'clf__n_estimators': [10,20,30,40,50,60,70,80,90,100,150],\
                'clf__learning_rate': np.logspace(-5,0,15)},\
               
               {'clf__n_estimators': [10,20,30,40,50,60,70,80,90,100,150],\
                'clf__max_depth': [1,2,3,4,5,6,7,8,9,10]},\

               {'clf__n_estimators': [10,20,30,40,50,60,70,80,90,100,150],\
                'clf__learning_rate': np.logspace(-5,0,15)},\
               
               {'clf__hidden_layer_sizes': [(500, 10),(500, ),(400, 10),(400, ),(300, 10),(300, ),(200, 10),(200, ),(100, 10),(100, )],\
                'clf__learning_rate':['constant', 'invscaling', 'adaptive']}\
                            
             ]

model_file_names = ['logistic_regression.pkl','svm.pkl','decision_tree.pkl',\
                    'adaboost.pkl','random_forest.pkl','gradient_boos.pkl',\
                    'multi_layer_perceptron.pkl']

classifier_scores=[]

best_models=[]
# Fit the pipelines
for pipe, param, model_file in zip(pipelines, grid_params, model_file_names):
  
  # Start Timer for model fit
  model_start_time = timeit.default_timer()
  # Construct grid search
  gs = GridSearchCV(estimator=pipe,\
                    param_grid=param,\
                    scoring='accuracy',\
                    cv=5)
	# Fit using grid search
  gs.fit(trainX, trainY)
  # Best accuracy
  print('Best accuracy: %.3f' % gs.best_score_)
  # Best params
  print('\nBest params:\n', gs.best_params_)
  model_stop_time = timeit.default_timer()
  
  classifier_scores.append(calc_metrics(gs, testX, testY, model_start_time, model_stop_time))
  joblib.dump(gs.best_estimator_, model_file)
  best_models.append(model_file)

stop = timeit.default_timer()

print('Total Execution Time: ', stop - start)

df_model_performance = pd.DataFrame(classifier_scores, \
                                    index=['Logistic Regression', 'Support Vector Machine', \
                                           'Decision Tree', 'AdaBoost', 'RandomForest', \
                                           'GradientBoost', 'MultiLayerPerceptron'\
                                          ]\
                                   )
df_model_performance



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').