In [1]:
import pandas as pd
df_matches = pd.read_csv("matches.csv")
df_matches.head(3)

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,


In [2]:
df_deliveries = pd.read_csv("deliveries.csv")
df_deliveries.head(3)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,4,0,4,,,


In [3]:
print(df_matches.shape)
print(df_deliveries.shape)

(756, 18)
(179078, 21)


In [4]:
#plotting every team wins. 
import matplotlib.pyplot as plt
import plotly.graph_objs as go
df_winner_matches = df_matches[['id','winner']].groupby('winner').size().reset_index()
df_winner_matches.columns = ['winner','total_wins']
df_winner_matches = df_winner_matches.sort_values('total_wins',ascending=False)

data = go.Bar(x = df_winner_matches['winner'],y = df_winner_matches['total_wins'])
layout = go.Layout(title = 'Total wins by each team in IPL',xaxis = dict(title = 'Team'),yaxis = dict(title = 'No of wins'))
fig = go.Figure(data = data,layout = layout)
fig.show()

In [5]:
print(df_matches.columns)
print(df_deliveries.columns)

Index(['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs',
       'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2',
       'umpire3'],
      dtype='object')
Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')


In matches data frame columns like umpire1,umpire2,umpire3 are not useful because these days they are using technology. So we cant decide which umpire is more useful for predicting anything related to match.

Date is also not useful because we already have season column which is also almost equal to date.

City and venue also not important features for this year IPL. Because IPL2020 is happening in UAE, only 2014 ipl was happened in UAE that to half season. I am removing Venue column and keeping City column.

In [6]:
df_matches = df_matches.drop(['venue','date','umpire1','umpire2','umpire3'], axis = 1) 
df_matches.head(2)

Unnamed: 0,id,season,city,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match
0,1,2017,Hyderabad,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh
1,2,2017,Pune,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith


In [7]:
print(df_matches["team2"].value_counts())

Kolkata Knight Riders          95
Royal Challengers Bangalore    95
Delhi Daredevils               89
Mumbai Indians                 86
Kings XI Punjab                85
Rajasthan Royals               80
Chennai Super Kings            75
Sunrisers Hyderabad            45
Deccan Chargers                32
Pune Warriors                  26
Gujarat Lions                  16
Delhi Capitals                 10
Rising Pune Supergiant          8
Rising Pune Supergiants         7
Kochi Tuskers Kerala            7
Name: team2, dtype: int64


In [8]:
#In 2019 delhi daredevils name changed to delhi capitals. So i am replacing old name with new name for all the rows.
df_matches = df_matches.replace(to_replace ="Delhi Daredevils", value = "Delhi Capitals") 

df_deliveries = df_deliveries.replace(to_replace = "Delhi Daredevils", value = "Delhi Capitals")

#RPS name also printed wrongly, So i am changing that as well
df_matches = df_matches.replace(to_replace ="Rising Pune Supergiant", value = "Rising Pune Supergiants") 

df_deliveries = df_deliveries.replace(to_replace = "Rising Pune Supergiant", value = "Rising Pune Supergiants ")

In [9]:
#checking null values in matches dataset
print(df_matches.isnull().sum())

id                 0
season             0
city               7
team1              0
team2              0
toss_winner        0
toss_decision      0
result             0
dl_applied         0
winner             4
win_by_runs        0
win_by_wickets     0
player_of_match    4
dtype: int64


In [10]:
#printing null rows in matches dataset
df1_null = df_matches[df_matches.isna().any(axis=1)]
df1_null.head()

Unnamed: 0,id,season,city,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match
300,301,2011,Delhi,Delhi Capitals,Pune Warriors,Delhi Capitals,bat,no result,0,,0,0,
461,462,2014,,Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Royal Challengers Bangalore,0,7,PA Patel
462,463,2014,,Kolkata Knight Riders,Delhi Capitals,Kolkata Knight Riders,bat,normal,0,Delhi Capitals,0,4,JP Duminy
466,467,2014,,Chennai Super Kings,Rajasthan Royals,Rajasthan Royals,field,normal,0,Chennai Super Kings,7,0,RA Jadeja
468,469,2014,,Sunrisers Hyderabad,Delhi Capitals,Sunrisers Hyderabad,bat,normal,0,Sunrisers Hyderabad,4,0,AJ Finch


In [11]:
#After seeing above table we can clearly understood that 4 matches are not played beacuse of rain, badlight or pitch condition. 
#So i am replacing those null values in winner column with no result and palyer of the match column null values with not declared.
df_matches["winner"].fillna("No result", inplace = True)
df_matches['player_of_match'].fillna("Not declared", inplace = True)
#city column also has 7 null values. I checked those matches data in Espn website and they played in Dubai.
df_matches["city"].fillna("Dubai", inplace = True)
print(df_matches.shape)

(756, 13)


In [12]:
#checking null values in deliveries dataset
print(df_deliveries.isnull().sum())

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batsman                  0
non_striker              0
bowler                   0
is_super_over            0
wide_runs                0
bye_runs                 0
legbye_runs              0
noball_runs              0
penalty_runs             0
batsman_runs             0
extra_runs               0
total_runs               0
player_dismissed    170244
dismissal_kind      170244
fielder             172630
dtype: int64


Player_dismissed, dismissal_kind and fielder features are having null values. But if you consider today questions only 'player_dismissed' and 'dismissal_kind' features are useful for today analysis. So i am removing remaining two features.

Filling player_dismissed null values with 'No wicket', didmissal_Kind null values with also No Wicket

In [13]:
#dropping fielder column
df_deliveries = df_deliveries.drop(['fielder'], axis = 1) 
#filling null values
df_deliveries["player_dismissed"].fillna("No Wicket", inplace = True)
df_deliveries["dismissal_kind"].fillna("No Wicket", inplace = True)
df_deliveries.head(2)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,wide_runs,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,No Wicket,No Wicket
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,0,0,0,0,0,0,0,0,No Wicket,No Wicket


### Exploratory Data Analysis

### Q1) Who will win the match RR VS CSK?

In [14]:
# Lets check winning ratio between CSK VS RR
df_MI_KXIP = df_matches[((df_matches['team1'] == 'Chennai Super Kings') | (df_matches['team1'] == 'Rajasthan Royals')) & 
                         ((df_matches['team2'] == 'Chennai Super Kings') | (df_matches['team2'] == 'Rajasthan Royals'))] 

#plotting MI and KXIP wins against each other
import plotly.graph_objs as go
df_winner = df_MI_KXIP[['id','winner']].groupby('winner').size().reset_index()
df_winner.columns = ['winner','total_wins']
df_winner = df_winner.sort_values('total_wins',ascending=False)[:2]

data = go.Bar(x = df_winner['winner'],y = df_winner['total_wins'])
layout = go.Layout(title = 'Number of wins by CSK and RR against each other',xaxis = dict(title = 'Team'),yaxis = dict(title = 'No of wins'))
fig = go.Figure(data = data,layout = layout)
fig.show() 

CSK won more matches than RR in head-head matches but this is not enough to predict todays match results. Because players will change every 2 years. So its better look individual player runs or wickets against each other.

In [17]:
df_RRbatsman_CSKbowlers = df_deliveries[((df_deliveries['batsman'] == 'JC Buttler') | (df_deliveries['batsman'] == 'SPD Smith') |
                             (df_deliveries['batsman'] == 'SV Samson') | (df_deliveries['bowler'] == 'DL Chahar') | 
                             (df_deliveries['bowler'] == 'SN Thakur') | (df_deliveries['bowler'] == 'RA Jadeja'))] 

print("RR top 3 batsman runs against CSK bowlers : " , df_RRbatsman_CSKbowlers['batsman_runs'].sum())
RR_bat_average = df_RRbatsman_CSKbowlers['batsman_runs'].sum()/len(df_RRbatsman_CSKbowlers[(df_RRbatsman_CSKbowlers['player_dismissed'] == 'JC Buttler') |
                                                                                              (df_RRbatsman_CSKbowlers['player_dismissed'] == 'SPD Smith')|
                                                                                              (df_RRbatsman_CSKbowlers['player_dismissed'] == 'SV Samson')])
print("RR batsman batting average against CSK bowlers : ",RR_bat_average)

df_CSKbatsman_RRbowlers= df_deliveries[((df_deliveries['batsman'] == 'SR Watson') | (df_deliveries['batsman'] == 'F du Plessis') |
                             (df_deliveries['batsman'] == 'MS Dhoni') | (df_deliveries['bowler'] == 'J Archer') |
                             (df_deliveries['bowler'] == 'S Gopal') | (df_deliveries['bowler'] == 'BA Stokes'))]

print("CSK top 3 batsman runs against RR bowlers : " , df_CSKbatsman_RRbowlers['batsman_runs'].sum())
CSK_bat_average = df_CSKbatsman_RRbowlers['batsman_runs'].sum()/len(df_CSKbatsman_RRbowlers[(df_CSKbatsman_RRbowlers['player_dismissed'] == 'SR Watson') |
                                                                                              (df_CSKbatsman_RRbowlers['player_dismissed'] == 'F du Plessis')|
                                                                                              (df_CSKbatsman_RRbowlers['player_dismissed'] == 'MS Dhoni')])
print("CSK batsman batting average against RR bowlers : ",CSK_bat_average)

RR top 3 batsman runs against CSK bowlers :  10509
RR batsman batting average against CSK bowlers :  62.55357142857143
CSK top 3 batsman runs against RR bowlers :  12031
CSK batsman batting average against RR bowlers :  44.39483394833948


**Conclusion :-**

By looking at above analysis RR batsman has uper hand against current CSK bowlers. So i am predicting RR will win today match.

### Q2) How many runs will RR Score in the first 6 overs?

In [18]:
#Actually top 3 batsman(i.e openers and onedown batsman) will face more deliveries in powerplay.
#So i am picking top3 batsman from Rajasthan royals team(i.e Buttler, Stokes, Samson).
df_top3_batsman_RR = df_deliveries[(df_deliveries['batsman'] == 'JC Buttler') | (df_deliveries['batsman'] == 'RV Uthappa') |
                                   (df_deliveries['batsman'] == 'SPD Smith')]
df_top3_batsman_RR.head()

#Now we are calculating runs scored in powerplay i.e first 6 overs by above 3 batsman. 
#So for that we can filter out 1 to 6 overs from 'over' feature/column.
df_top3_powerplay_RR = df_top3_batsman_RR[(df_top3_batsman_RR['over'] <= 6)]
                                          


In [22]:
#Now we can calcualte the batting average of RR top 3 batsman in powerplay

df_powerpaly_avg_RR = df_top3_powerplay_RR["batsman_runs"].sum() /len(df_top3_powerplay_RR[(df_top3_powerplay_RR['player_dismissed'] == 'JC Buttler') |
                                                                      (df_top3_powerplay_RR['player_dismissed'] == 'RV Uthappa')|
                                                                      (df_top3_powerplay_RR['player_dismissed'] == 'SPD Smith')])

print("Average runs scored by current Rajasthan Royals top 3 batsman in powerplay",int(df_powerpaly_avg_RR))

Average runs scored by current Rajasthan Royals top 3 batsman in powerplay 41


In [23]:
#Lets calculate avg runs per match in powerplay by RR team as well
df_powerplay_RR = df_deliveries[(df_deliveries['over'] <=6) & (df_deliveries['batting_team'] == "Rajasthan Royals")]
total_runs = df_powerplay_RR["batsman_runs"].sum()
no_of_matches = df_powerplay_RR['match_id'].nunique()
avg_runs_per_match = total_runs/no_of_matches
print("Average runs scored by RR team in powerplay is",int(avg_runs_per_match))

Average runs scored by RR team in powerplay is 42


**Conclusion :-**

By looking at above two analysis i am predicting RR will score (40-49) runs in powerplay.

### Q3) What will Deepak Chahar Economay rate?

In [42]:
df_Chahar = df_deliveries[(df_deliveries['bowler'] == 'DL Chahar')]
#For calculating economy rate we need number of overs bowled by Chahar
#So what i am doing was  first i am calculating number of balls bowled by deepak chahar then converting to Overs
#Note: In balls column we have numbers like 7 and 8 but we dont require that bacuase we want overs. 
#Each over contain 6 balls. So i.e why i am taking condition balls<=6.
chahar_balls = len(df_Chahar[df_Chahar['ball'] <= 6])
chahar_overs = chahar_balls/6
print("Number of overs bowled by Deepak Chahar in IPL is",chahar_overs)

#Now calcualte number of runs given by chahar
#Wide,No-ball,penalty and batsman runs comes under runs given by bowler.
wide_runs = df_Chahar['wide_runs'].sum()
noball_runs = df_Chahar['noball_runs'].sum()
penalty_runs = df_Chahar['penalty_runs'].sum()
batsman_runs = df_Chahar['batsman_runs'].sum()
chahar_runs = wide_runs + noball_runs + penalty_runs + batsman_runs
print("Number of runs given by Deepak Chahar in IPL is",chahar_runs)

Number of overs bowled by Deepak Chahar in IPL is 115.0
Number of runs given by Deepak Chahar in IPL is 914


In [44]:
#Now calculate the economy rate of chahar in ipl
economy_rate_chahar = chahar_runs/chahar_overs
print("Deepak Chahar ecoenomy rate in ipl is",economy_rate_chahar)

Deepak Chahar ecoenomy rate in ipl is 7.947826086956522


In [50]:
#Lets calculate his economy rate against Rajasthan royals.
df_Chahar_against_RR = df_deliveries[(df_deliveries['bowler'] == 'DL Chahar') & (df_deliveries['batting_team'] == 'Rajasthan Royals') ]
df_Chahar_against_RR.head()

chahar_balls = len(df_Chahar_against_RR[df_Chahar_against_RR['ball'] <= 6])
chahar_overs = chahar_balls/6
print("Number of overs bowled by Deepak Chahar against RR in IPL is",chahar_overs)

#Now calcualte number of runs given by chahar
#Wide,No-ball,penalty and batsman runs comes under runs given by bowler.
wide_runs = df_Chahar_against_RR['wide_runs'].sum()
noball_runs = df_Chahar_against_RR['noball_runs'].sum()
penalty_runs = df_Chahar_against_RR['penalty_runs'].sum()
batsman_runs = df_Chahar_against_RR['batsman_runs'].sum()
chahar_runs = wide_runs + noball_runs + penalty_runs + batsman_runs
print("Number of runs given by Deepak Chahar against RR in IPL is",chahar_runs,"\n")

#Now calculate the economy rate of chahar in ipl
economy_rate_chahar = chahar_runs/chahar_overs
print("Deepak Chahar ecoenomy rate against RR in ipl is",economy_rate_chahar)

Number of overs bowled by Deepak Chahar against RR in IPL is 12.0
Number of runs given by Deepak Chahar against RR in IPL is 88 

Deepak Chahar ecoenomy rate against RR in ipl is 7.333333333333333


**Conclusion :-**

Deepak chahar economy rate in IPL is 7.9 and against RR is 7.33. So i am predicting in today match his economy rate will be in the range of (7.31 - 8.2) according to above analysis.

### Q4) How many wickets will jofra archer will take in the death?

As per my cricket knowledge Jofra archer is playing ipl from 2018 only. So we have very less data related to archer against CSK. i.e why i am taking all the matches of jofra in ipl.

In [66]:
#First filter out death overs (16-20) and bowler jofra archer.
df_deathover_archer = df_deliveries[(df_deliveries['bowler'] == 'J Archer') & (df_deliveries['over'] >= 16)]
                                    

df_deathover_archer.head()
archer_total_wickets = len(df_deathover_archer[df_deathover_archer['player_dismissed'] != "No Wicket"])
no_of_matches = df_deathover_archer['match_id'].nunique()
print("In",no_of_matches,"matches Jofra Archer bowled in death overs")
print("Jofra archer taken",archer_total_wickets, "wickets in death overs")


In 20 matches Jofra Archer bowled in death overs
Jofra archer taken 14 wickets in death overs


**Conclusion :-**

By looking at above analysis he taken almost 1 wicket per match in death overs.

So i am predicting Jofra Archer will take 1 wicket in death overs. 

### Q5) What will Steve Smith Strike rate be during the match?

In [83]:
#Lets calculate Smith strike rate against current CSK bowlers.
df_smith_against_CSK = df_deliveries[(df_deliveries['batsman'] == 'SPD Smith') & 
                         ((df_deliveries['bowler'] == 'DL Chahar') | (df_deliveries['bowler'] == 'DJ Bravo') |
                         (df_deliveries['bowler'] == 'SN Thakur')  | (df_deliveries['bowler'] == 'RA Jadeja'))]                                           

#Number of runs scored by smith against current CSk bowlers
total_runs = df_smith_against_CSK['batsman_runs'].sum()
#Number of balls faced by smith against current CSK bowlers
#Here wide balls are not counted under batsman faced balls. We can count No-balls under batsman faced deliveries
no_of_balls = len(df_smith_against_CSK['ball']) - len(df_smith_against_CSK[df_smith_against_CSK['wide_runs'] >= 1])

print("Steve Smith Scored",total_runs, "runs aginst current CSK bowlers in",no_of_balls,"balls")


Steve Smith Scored 135 runs aginst current CSK bowlers in 113 balls


In [87]:
Strike_rate = (total_runs/no_of_balls)*100
print("Steve Smith Strike rate against current CSK bowlers is",int(Strike_rate))

Steve Smith Strike rate against current CSK bowlers is 119


**Conclusion :-**

According to above analysis, I am predicting Steve smith strike in today match is less than 120.