### Data Pre-Processing

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

df_matches = pd.read_csv("matches.csv")
df_matches.head(3)

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,


In [None]:
df_deliveries = pd.read_csv("deliveries.csv")
df_deliveries.head(3)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,4,0,4,,,


In [None]:
print(df_matches.shape)
print(df_deliveries.shape)

(756, 18)
(179078, 21)


In [None]:
#plotting every team wins in ipl
import matplotlib.pyplot as plt
import plotly.graph_objs as go
df_winner_matches = df_matches[['id','winner']].groupby('winner').size().reset_index()
df_winner_matches.columns = ['winner','total_wins']
df_winner_matches = df_winner_matches.sort_values('total_wins',ascending=False)

data = go.Bar(x = df_winner_matches['winner'],y = df_winner_matches['total_wins'])
layout = go.Layout(title = 'Total wins by each team in IPL',xaxis = dict(title = 'Team'),yaxis = dict(title = 'No of wins'))
fig = go.Figure(data = data,layout = layout)
fig.show()

In [None]:
print(df_matches.columns)
print(df_deliveries.columns)

Index(['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs',
       'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2',
       'umpire3'],
      dtype='object')
Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')


In matches data frame columns like umpire1,umpire2,umpire3 are not useful because these days they are using technology. So we cant decide which umpire is more useful for predicting anything related to match.

Date is also not useful because we already have season column which is also almost equal to date.

City and venue also not important features for this year IPL. Because IPL2020 is happening in UAE, only 2014 ipl was happened in UAE that to half season. But i am removing only venue it almost equal to city feature.

'Result' feature is almost equal to 'winner' feature. So i am dropping result column also.

In [None]:
df_matches = df_matches.drop(['venue','date','result','umpire1','umpire2','umpire3'], axis = 1) 
df_matches.head(2)

Unnamed: 0,id,season,city,team1,team2,toss_winner,toss_decision,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match
0,1,2017,Hyderabad,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,0,Sunrisers Hyderabad,35,0,Yuvraj Singh
1,2,2017,Pune,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,0,Rising Pune Supergiant,0,7,SPD Smith


In [None]:
print(df_matches["team1"].value_counts())

Mumbai Indians                 101
Kings XI Punjab                 91
Chennai Super Kings             89
Royal Challengers Bangalore     85
Kolkata Knight Riders           83
Delhi Daredevils                72
Rajasthan Royals                67
Sunrisers Hyderabad             63
Deccan Chargers                 43
Pune Warriors                   20
Gujarat Lions                   14
Rising Pune Supergiant           8
Rising Pune Supergiants          7
Kochi Tuskers Kerala             7
Delhi Capitals                   6
Name: team1, dtype: int64


In [None]:
#In 2019 delhi daredevils name changed to delhi capitals. So i am replacing old name with new name for all the rows.
df_matches = df_matches.replace(to_replace ="Delhi Daredevils", value = "Delhi Capitals") 

df_deliveries = df_deliveries.replace(to_replace = "Delhi Daredevils", value = "Delhi Capitals")

#Rising pune supergiants name also printed in two ways. Lets change it to Rising pune supergiants.
df_matches = df_matches.replace(to_replace ="Rising Pune Supergiant", value = "Rising Pune Supergiants") 

df_deliveries = df_deliveries.replace(to_replace = "Rising Pune Supergiant", value = "Rising Pune Supergiants")


In [None]:
#checking null values in matches dataset
print(df_matches.isnull().sum())

id                 0
season             0
city               7
team1              0
team2              0
toss_winner        0
toss_decision      0
dl_applied         0
winner             4
win_by_runs        0
win_by_wickets     0
player_of_match    4
dtype: int64


In [None]:
#printing null rows in matches dataset
df1_null = df_matches[df_matches.isna().any(axis=1)]
df1_null.head()

Unnamed: 0,id,season,city,team1,team2,toss_winner,toss_decision,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match
300,301,2011,Delhi,Delhi Capitals,Pune Warriors,Delhi Capitals,bat,0,,0,0,
461,462,2014,,Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore,field,0,Royal Challengers Bangalore,0,7,PA Patel
462,463,2014,,Kolkata Knight Riders,Delhi Capitals,Kolkata Knight Riders,bat,0,Delhi Capitals,0,4,JP Duminy
466,467,2014,,Chennai Super Kings,Rajasthan Royals,Rajasthan Royals,field,0,Chennai Super Kings,7,0,RA Jadeja
468,469,2014,,Sunrisers Hyderabad,Delhi Capitals,Sunrisers Hyderabad,bat,0,Sunrisers Hyderabad,4,0,AJ Finch


In [None]:
#After seeing above table we can clearly understood that 4 matches are not played beacuse of rain, badlight or pitch condition. 
#So i am replacing those null values in winner column with no result and palyer of the match column null values with not declared.
df_matches["winner"].fillna("No result", inplace = True)
df_matches['player_of_match'].fillna("Not declared", inplace = True)
#city column also has 7 null values. I checked those matches data in Espn website and they played in Dubai.
df_matches["city"].fillna("Dubai", inplace = True)
print(df_matches.shape)

(756, 12)


In [None]:
#checking null values in deliveries dataset
print(df_deliveries.isnull().sum())

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batsman                  0
non_striker              0
bowler                   0
is_super_over            0
wide_runs                0
bye_runs                 0
legbye_runs              0
noball_runs              0
penalty_runs             0
batsman_runs             0
extra_runs               0
total_runs               0
player_dismissed    170244
dismissal_kind      170244
fielder             172630
dtype: int64


Player_dismissed, dismissal_kind and fielder features are having null values. But if you consider today questions only 'player_dismissed' and 'dismissal_kind'  features are useful for today analysis. So i am removing remaining two features.

Filling player_dismissed null values with 'No wicket', didmissal_Kind null values with also No Wicket 

In [None]:
#dropping fielder column
df_deliveries = df_deliveries.drop(['fielder'], axis = 1) 
#filling null values
df_deliveries["player_dismissed"].fillna("No Wicket", inplace = True)
df_deliveries["dismissal_kind"].fillna("No Wicket", inplace = True)
df_deliveries.head(2)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,No Wicket,No Wicket
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,No Wicket,No Wicket


### Exploratory Data Analysis(EDA)

### Q1) Who will win MI or KXIP?

In [None]:
# Lets check winning ratio between MI VS KXIP
df_MI_KXIP = df_matches[((df_matches['team1'] == 'Kings XI Punjab') | (df_matches['team1'] == 'Mumbai Indians')) & 
                         ((df_matches['team2'] == 'Kings XI Punjab') | (df_matches['team2'] == 'Mumbai Indians'))] 

#plotting MI and KXIP wins against each other
import plotly.graph_objs as go
df_winner = df_MI_KXIP[['id','winner']].groupby('winner').size().reset_index()
df_winner.columns = ['winner','total_wins']
df_winner = df_winner.sort_values('total_wins',ascending=False)[:2]

data = go.Bar(x = df_winner['winner'],y = df_winner['total_wins'])
layout = go.Layout(title = 'Number of wins by MI and KXIP against each other',xaxis = dict(title = 'Team'),yaxis = dict(title = 'No of wins'))
fig = go.Figure(data = data,layout = layout)
fig.show() 

Mumbai won 13 matches and Punjab won 11 matches. So their head-head record is almost same. 

Lets look at how many man of match awrads won current MI and KXIP players. So that we can decide which team has more match winners.

In [None]:
#Lets look at how many player of the match awards won by current RCB and KXIP players.
#Note: I am taking top 8 palyers from both teams
#Sometimes team win dependent on Number of match winners in your team. i.e why i am using this approach.
#Note: I am taking these player names because these players are played atleast one match for their respective sides in this season till now.
df_man_of_match_RCB = df_matches[(df_matches['player_of_match'] == 'RG Sharma') | (df_matches['player_of_match'] == 'Q de Kock') |
                                (df_matches['player_of_match'] == 'HH Pandya') | (df_matches['player_of_match'] == 'KA Pollard	') |
                                (df_matches['player_of_match'] == 'JJ Bumrah') | (df_matches['player_of_match'] == 'TA Boult') |
                                (df_matches['player_of_match'] == 'RD Chahar') | (df_matches['player_of_match'] == 'AS Yadav')]

print("Total number of man of matches won by present Mumbai players is", df_man_of_match_RCB.shape[0])

df_man_of_match_KXIP = df_matches[(df_matches['player_of_match'] == 'KL Rahul') | (df_matches['player_of_match'] == 'MA Agarwal') |
                                (df_matches['player_of_match'] == 'GJ Maxwell') | (df_matches['player_of_match'] == 'CH Gayle') |
                                (df_matches['player_of_match'] == 'Mohammed Shami') | (df_matches['player_of_match'] == 'Mandeep Singh') |
                                (df_matches['player_of_match'] == 'CJ Jordan') | (df_matches['player_of_match'] == 'M Ashwin') ]
                                
print("Total number of man of matches won by Punjab players is", df_man_of_match_KXIP.shape[0])

Total number of man of matches won by present Mumbai players is 32
Total number of man of matches won by Punjab players is 36


**Conclusion :-**

KXIP players won more man of the match awards than MI. That means KXIP has more match winners.

 So i am predicting KXIP will win today match.

### Q2) To what type of bowler Quinton de kock get out to?

In [46]:
# Lets filter out quinton decock wicket against bowlers.
df_Decock = df_deliveries[df_deliveries['player_dismissed'] == 'Q de Kock']

#Printing top 6 bowlers who got decock wicket most times
df_Decock['bowler'].value_counts()[:6]

MJ McClenaghan    3
YS Chahal         3
AB Dinda          2
DL Chahar         2
KV Sharma         2
AD Russell        2
Name: bowler, dtype: int64

So, if you look at top 6 bowlers he got out to most of the times are seamer/fast bowler(MJ McClenaghan, AB Dinda, DL Chahar, AD Russell).

Lets look at which present KXIP bolwer got out Quinton decock most of the times.

In [47]:
df_Decock_kxipbowlers = df_deliveries[(df_deliveries['player_dismissed'] == 'Q de Kock') &

                                      ((df_deliveries['bowler'] == 'Mohammed Shami') | (df_deliveries['bowler'] == 'Mohammed Shami') |
                                       (df_deliveries['bowler'] == 'M Ashwin') | (df_deliveries['bowler'] == 'GJ Maxwell'))]

df_Decock_kxipbowlers.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind
166772,11145,1,Mumbai Indians,Kings XI Punjab,13,7,Q de Kock,Yuvraj Singh,Mohammed Shami,0,0,0,0,0,0,0,0,0,Q de Kock,lbw


**Conclusion**

If you look at above observation only Mohammad shami got the decock wicket in present KXIP bowlers. 

Mohmmad shami is a fast bowler and also if you look at over all dismissals of decock he got out most of the times to fast bowler only. 

I am predicting he will got out to Seam/Fast

### Q3) How many wickets will KXIP will loose in the powerplay?

In [57]:
#Lets filter out powerplay overs of KXIP team
df_KXIP_PP = df_deliveries[(df_deliveries['batting_team'] == 'Kings XI Punjab') & (df_deliveries['over'] <= 6 )]

no_of_wickets = len(df_KXIP_PP[df_KXIP_PP['player_dismissed'] != 'No Wicket'])
no_of_matches = df_KXIP_PP['match_id'].nunique()
avg_no_of_wickets = no_of_wickets/no_of_matches

print(no_of_wickets, "wickets lost by KXIP in powerplay(1-6overs) in", no_of_matches, "matches")
print("KXIP lost ",int(avg_no_of_wickets)," wicket per match in powerplay")

268 wickets lost by KXIP in powerplay(1-6overs) in 176 matches
KXIP lost  1  wicket per match in powerplay


In [60]:
#Lets look at how many wickets took by MI bowlers in powerplay
#Lets filter out powerplay overs of KXIP team
df_MI_PP = df_deliveries[(df_deliveries['bowling_team'] == 'Kings XI Punjab') & (df_deliveries['over'] <= 6 )]

no_of_wickets = len(df_MI_PP[df_MI_PP['player_dismissed'] != 'No Wicket'])
no_of_matches = df_MI_PP['match_id'].nunique()
avg_noof_wickets = no_of_wickets/no_of_matches

print(no_of_wickets, "wickets taken by MI bowlers in powerplay(1-6overs) in", no_of_matches, "matches")
print("MI taken ",int(avg_no_of_wickets)," wicket per match in powerplay")

241 wickets taken by MI bowlers in powerplay(1-6overs) in 176 matches
MI taken  1  wicket per match in powerplay


**Conclusion :-**

By looking at above two analysis i am predicting KXIP will loose (0-1) wicket in powerplay. 

### Q4) How will KL Rahul get out?

In [72]:
#filtering out KL rahul from player dismissed
df_Rahul = df_deliveries[(df_deliveries['player_dismissed'] == 'KL Rahul')]
df_Rahul_total_matches = df_deliveries[df_deliveries['batsman'] == "KL Rahul"]

print(df_Rahul_total_matches['match_id'].nunique(), " number of times he got the batting.")
print("he got out below number of times")
df_Rahul['dismissal_kind'].value_counts()

58  number of times he got the batting.
he got out below number of times


caught     30
bowled     10
lbw         4
stumped     2
run out     1
Name: dismissal_kind, dtype: int64

According to above analysis he caught out most of the times(30),he remained notout(11) times and he bowled 10 times.

I am predicting KL Rahul get out by Caught.

### Q5) How many wickets will fall in during the match?

In [81]:
#Lets filter out MI VS KXIP matches 
df_MI_KXIP = df_deliveries[((df_deliveries['bowling_team'] == "Kings XI Punjab") & (df_deliveries['batting_team'] == "Mumbai Indians"))
                                                                                    |
                                  ((df_deliveries['bowling_team'] == "Mumbai Indians") & (df_deliveries['batting_team'] == "Kings XI Punjab"))]
df_wickets.head(2)
#Calculating number of wickets lost in all matches
no_of_wickets = len(df_MI_KXIP[df_MI_KXIP['player_dismissed'] != 'No Wicket'])
no_of_mathes = df_MI_KXIP['match_id'].nunique()
avg_no_of_wickets = no_of_wickets/no_of_mathes

print(no_of_wickets, "wickets lost by KXIP and MI teams in", no_of_mathes," MI VS KXIP matches")
print("They lost",int(avg_no_of_wickets), "wickets per match" )

294 wickets lost by KXIP and MI teams in 24  MI VS KXIP matches
They lost 12 wickets per match


In [84]:
#Lets look individual bowler record also in MI VS KXIP matches
#Note: Taking only present 6 bowlers(4 from MI and 2from KXIP) beacuse remaining bowlers are not played much IPL before this season
df_bowlers = df_MI_KXIP[(df_MI_KXIP['bowler'] == 'JJ Bumrah') | (df_MI_KXIP['bowler'] == 'TA Boult') |
                        (df_MI_KXIP['bowler'] == 'RD Chahar') | (df_MI_KXIP['bowler'] == 'KH Pandya') |
                        (df_MI_KXIP['bowler'] == 'Mohammed Shami') | (df_MI_KXIP['bowler'] == 'M Ashwin')]

#Calculating number of wickets taken by present MI and KXP bowlers in MI VS KXIP matches
no_of_wickets = len(df_bowlers[df_bowlers['player_dismissed'] != 'No Wicket'])
no_of_mathes = df_bowlers['match_id'].nunique()
avg_no_of_wickets = no_of_wickets/no_of_mathes

print(no_of_wickets, "wickets taken by present 6 KXIP and MI team bowlers in", no_of_mathes," MI VS KXIP matches")

24 wickets taken by present 6 KXIP and MI team bowlers in 10  MI VS KXIP matches


**Conclusion :-**

24 wickets in 10 matches by 6 bowlers is very less number of wickets. But if you consider first analysis they lost 12 wickets per match.

So i am predicting they will loose 6-10 wickets in match.