In [2]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
matches = pd.read_excel('matches.xlsx')
deliveries = pd.read_excel('deliveries.xlsx')
matches.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,win_by_wickets,player_of_match,venue
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal"
1,2,2017,Pune,2017-04-06,Rising Pune Supergiant,Mumbai Indians,Rising Pune Supergiant,field,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium
3,4,2017,Indore,2017-04-08,Kings XI Punjab,Rising Pune Supergiant,Kings XI Punjab,field,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium
4,5,2017,Bengaluru,2017-04-08,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,Royal Challengers Bangalore,15,0,KM Jadhav,M.Chinnaswamy Stadium


In [5]:
matches.shape,deliveries.shape

((1024, 13), (243817, 12))

In [6]:
deliveries

Unnamed: 0,Match_id,Inning Number,Batting_team,Bowling_team,Over,Ball,Batsman,Non-Striker,Bowler,is_super_over,player_dismissed,Total
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,0,1,DA Warner,S Dhawan,TS Mills,0,,0
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,0,2,DA Warner,S Dhawan,TS Mills,0,,0
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,0,3,DA Warner,S Dhawan,TS Mills,0,,4
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,0,4,DA Warner,S Dhawan,TS Mills,0,,0
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,0,5,DA Warner,S Dhawan,TS Mills,0,,2
...,...,...,...,...,...,...,...,...,...,...,...,...
243812,1024,2,Royal Challengers Bangalore,Sunrisers Hyderabad,19,2,Sachin Baby,CJ Jordan,B Kumar,0,,2
243813,1024,2,Royal Challengers Bangalore,Sunrisers Hyderabad,19,3,Sachin Baby,CJ Jordan,B Kumar,0,CJ Jordan,0
243814,1024,2,Royal Challengers Bangalore,Sunrisers Hyderabad,19,4,Iqbal Abdulla,Sachin Baby,B Kumar,0,,1
243815,1024,2,Royal Challengers Bangalore,Sunrisers Hyderabad,19,5,Sachin Baby,Iqbal Abdulla,B Kumar,0,,1


In [7]:
deliveries.columns

Index(['Match_id', 'Inning Number', 'Batting_team', 'Bowling_team', 'Over',
       'Ball', 'Batsman', 'Non-Striker', 'Bowler', 'is_super_over',
       'player_dismissed', 'Total'],
      dtype='object')

In [8]:
# grouping the 1st innings,2nd innings score in a particular matchid
# lets say match id = 1,so inning 1 score = 207,inning 2 score = 172,in that way

totalrun_df = deliveries.groupby(['Match_id','Inning Number']).sum()['Total'].reset_index()
totalrun_df

Unnamed: 0,Match_id,Inning Number,Total
0,1,1,207
1,1,2,172
2,2,1,184
3,2,2,187
4,3,1,183
...,...,...,...
2070,1022,2,140
2071,1023,1,162
2072,1023,2,163
2073,1024,1,208


In [9]:
# capturing only the first innings,as we will be predicting for the second innnigs

totalrun_df = totalrun_df[totalrun_df['Inning Number']==1]
totalrun_df['Total'] = totalrun_df['Total'].apply(lambda x:x+1)
totalrun_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  totalrun_df['Total'] = totalrun_df['Total'].apply(lambda x:x+1)


Unnamed: 0,Match_id,Inning Number,Total
0,1,1,208
2,2,1,185
4,3,1,184
6,4,1,164
8,5,1,158
...,...,...,...
2065,1020,1,139
2067,1021,1,159
2069,1022,1,163
2071,1023,1,163


In [10]:
matches.columns

Index(['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'winner', 'win_by_runs', 'win_by_wickets',
       'player_of_match', 'venue'],
      dtype='object')

In [11]:
'''
Merging the total first innings score df with the matches df,
where left side merging is done on "id" column of the matches
and right side merging is done on "match_id" column of the totalrun_df

This is an inner join. The inner join returns only the rows that have matching values in both tables, 
in this case, the 'matches' DataFrame and the 'totalrun_df' DataFrame. 
It returns only the rows where the 'id' column in the "matches" DataFrame has a match in the 'match_id' 
column of the "totalrun_df" DataFrame.

'''

match_df = matches.merge(totalrun_df[['Match_id','Total']],
                       left_on='id',right_on='Match_id')

match_df

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,win_by_wickets,player_of_match,venue,Match_id,Total
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",1,208
1,2,2017,Pune,2017-04-06,Rising Pune Supergiant,Mumbai Indians,Rising Pune Supergiant,field,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,2,185
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,3,184
3,4,2017,Indore,2017-04-08,Kings XI Punjab,Rising Pune Supergiant,Kings XI Punjab,field,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,4,164
4,5,2017,Bengaluru,2017-04-08,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,Royal Challengers Bangalore,15,0,KM Jadhav,M.Chinnaswamy Stadium,5,158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,1020,2016,Raipur,2016-05-22,Delhi Daredevils,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Royal Challengers Bangalore,0,6,V Kohli,Shaheed Veer Narayan Singh International Stadium,1020,139
1020,1021,2016,Bangalore,2016-05-24,Gujarat Lions,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Royal Challengers Bangalore,0,4,AB de Villiers,M Chinnaswamy Stadium,1021,159
1021,1022,2016,Delhi,2016-05-25,Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,field,Sunrisers Hyderabad,22,0,MC Henriques,Feroz Shah Kotla,1022,163
1022,1023,2016,Delhi,2016-05-27,Gujarat Lions,Sunrisers Hyderabad,Sunrisers Hyderabad,field,Sunrisers Hyderabad,0,4,DA Warner,Feroz Shah Kotla,1023,163


In [12]:
match_df['team1'].unique()

array(['Sunrisers Hyderabad', 'Rising Pune Supergiant', 'Gujarat Lions',
       'Kings XI Punjab', 'Royal Challengers Bangalore', 'Mumbai Indians',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Rajasthan Royals',
       'Chennai Super Kings', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants'],
      dtype=object)

In [13]:
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals',
    'Rising Pune Supergiant',
    'Gujarat Lions',
    'Delhi Daredevils',
]

In [14]:
# replacing the Delhi Daredevils with Delhi Capitals

match_df['team1'] = match_df['team1'].str.replace('Delhi Daredevils','Delhi Capitals')
match_df['team2'] = match_df['team2'].str.replace('Delhi Daredevils','Delhi Capitals')


# replacing the Deccan Chargers with Sunrises Hyderabad

match_df['team1'] = match_df['team1'].str.replace('Deccan Chargers','Sunrisers Hyderabad')
match_df['team2'] = match_df['team2'].str.replace('Deccan Chargers','Sunrisers Hyderabad')



In [15]:
# will consider only frequently occuring teams,
# which are mentioned in the teams list

match_df = match_df[match_df['team1'].isin(teams)]
match_df = match_df[match_df['team2'].isin(teams)]

match_df['team1'].unique()

array(['Sunrisers Hyderabad', 'Rising Pune Supergiant', 'Gujarat Lions',
       'Kings XI Punjab', 'Royal Challengers Bangalore', 'Mumbai Indians',
       'Kolkata Knight Riders', 'Delhi Capitals', 'Rajasthan Royals',
       'Chennai Super Kings'], dtype=object)

In [16]:
match_df.shape

(856, 15)

In [17]:
match_df.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,win_by_wickets,player_of_match,venue,Match_id,Total
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",1,208
1,2,2017,Pune,2017-04-06,Rising Pune Supergiant,Mumbai Indians,Rising Pune Supergiant,field,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,2,185
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,3,184
3,4,2017,Indore,2017-04-08,Kings XI Punjab,Rising Pune Supergiant,Kings XI Punjab,field,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,4,164
4,5,2017,Bengaluru,2017-04-08,Royal Challengers Bangalore,Delhi Capitals,Royal Challengers Bangalore,bat,Royal Challengers Bangalore,15,0,KM Jadhav,M.Chinnaswamy Stadium,5,158


In [18]:
deliveries.head(3)

Unnamed: 0,Match_id,Inning Number,Batting_team,Bowling_team,Over,Ball,Batsman,Non-Striker,Bowler,is_super_over,player_dismissed,Total
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,0,1,DA Warner,S Dhawan,TS Mills,0,,0
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,0,2,DA Warner,S Dhawan,TS Mills,0,,0
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,0,3,DA Warner,S Dhawan,TS Mills,0,,4


In [19]:
# merging matchdf with delevieries on match_id

delivery_df = match_df.merge(deliveries,on='Match_id')

delivery_df.head(5)

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,...,Batting_team,Bowling_team,Over,Ball,Batsman,Non-Striker,Bowler,is_super_over,player_dismissed,Total_y
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,Royal Challengers Bangalore,0,1,DA Warner,S Dhawan,TS Mills,0,,0
1,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,Royal Challengers Bangalore,0,2,DA Warner,S Dhawan,TS Mills,0,,0
2,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,Royal Challengers Bangalore,0,3,DA Warner,S Dhawan,TS Mills,0,,4
3,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,Royal Challengers Bangalore,0,4,DA Warner,S Dhawan,TS Mills,0,,0
4,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,Royal Challengers Bangalore,0,5,DA Warner,S Dhawan,TS Mills,0,,2


In [20]:
delivery_df.columns

Index(['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner',
       'toss_decision', 'winner', 'win_by_runs', 'win_by_wickets',
       'player_of_match', 'venue', 'Match_id', 'Total_x', 'Inning Number',
       'Batting_team', 'Bowling_team', 'Over', 'Ball', 'Batsman',
       'Non-Striker', 'Bowler', 'is_super_over', 'player_dismissed',
       'Total_y'],
      dtype='object')

In [21]:
# considering the 2nd innings because we have to keep a check on the current score of second innings

delivery_df = delivery_df[delivery_df['Inning Number'] == 2]
delivery_df.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,...,Batting_team,Bowling_team,Over,Ball,Batsman,Non-Striker,Bowler,is_super_over,player_dismissed,Total_y
125,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Royal Challengers Bangalore,Sunrisers Hyderabad,0,1,CH Gayle,Mandeep Singh,A Nehra,0,,1
126,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Royal Challengers Bangalore,Sunrisers Hyderabad,0,2,Mandeep Singh,CH Gayle,A Nehra,0,,0
127,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Royal Challengers Bangalore,Sunrisers Hyderabad,0,3,Mandeep Singh,CH Gayle,A Nehra,0,,0
128,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Royal Challengers Bangalore,Sunrisers Hyderabad,0,4,Mandeep Singh,CH Gayle,A Nehra,0,,2
129,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Royal Challengers Bangalore,Sunrisers Hyderabad,0,5,Mandeep Singh,CH Gayle,A Nehra,0,,4


In [22]:
delivery_df.shape

(98313, 26)

In [24]:

# current score of particular match

delivery_df['current_score'] = delivery_df.groupby('Match_id')['Total_y'].cumsum()

delivery_df.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,...,Bowling_team,Over,Ball,Batsman,Non-Striker,Bowler,is_super_over,player_dismissed,Total_y,current_score
125,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,0,1,CH Gayle,Mandeep Singh,A Nehra,0,,1,1
126,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,0,2,Mandeep Singh,CH Gayle,A Nehra,0,,0,1
127,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,0,3,Mandeep Singh,CH Gayle,A Nehra,0,,0,1
128,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,0,4,Mandeep Singh,CH Gayle,A Nehra,0,,2,3
129,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Sunrisers Hyderabad,0,5,Mandeep Singh,CH Gayle,A Nehra,0,,4,7


In [25]:
# runs left 

delivery_df['runs_left'] = delivery_df['Total_x']-delivery_df['current_score']

delivery_df.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,...,Over,Ball,Batsman,Non-Striker,Bowler,is_super_over,player_dismissed,Total_y,current_score,runs_left
125,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,1,CH Gayle,Mandeep Singh,A Nehra,0,,1,1,207
126,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,2,Mandeep Singh,CH Gayle,A Nehra,0,,0,1,207
127,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,3,Mandeep Singh,CH Gayle,A Nehra,0,,0,1,207
128,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,4,Mandeep Singh,CH Gayle,A Nehra,0,,2,3,205
129,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,5,Mandeep Singh,CH Gayle,A Nehra,0,,4,7,201


In [26]:
'''
if one ball is played,then balls left = 120-1 = 119
if two balls are played,then balls left = 120-2 = 118

so similarly if over=1,over has 6 balls right,so 1*6 = 6
now,ball = 1,so 6+1 = 7,now 126-7 = 119,which is same as (1)

so we'll use balls_left = 126-(over*6+current_ball)

'''

# balls left


delivery_df['balls_left'] = 120-(delivery_df['Over']*6+delivery_df['Ball'])

delivery_df

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,...,Ball,Batsman,Non-Striker,Bowler,is_super_over,player_dismissed,Total_y,current_score,runs_left,balls_left
125,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,1,CH Gayle,Mandeep Singh,A Nehra,0,,1,1,207,119
126,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,2,Mandeep Singh,CH Gayle,A Nehra,0,,0,1,207,118
127,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,3,Mandeep Singh,CH Gayle,A Nehra,0,,0,1,207,117
128,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,4,Mandeep Singh,CH Gayle,A Nehra,0,,2,3,205,116
129,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,5,Mandeep Singh,CH Gayle,A Nehra,0,,4,7,201,115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203850,1024,2016,Bangalore,2016-05-29,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,8,...,2,Sachin Baby,CJ Jordan,B Kumar,0,,2,194,15,4
203851,1024,2016,Bangalore,2016-05-29,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,8,...,3,Sachin Baby,CJ Jordan,B Kumar,0,CJ Jordan,0,194,15,3
203852,1024,2016,Bangalore,2016-05-29,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,8,...,4,Iqbal Abdulla,Sachin Baby,B Kumar,0,,1,195,14,2
203853,1024,2016,Bangalore,2016-05-29,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,8,...,5,Sachin Baby,Iqbal Abdulla,B Kumar,0,,1,196,13,1


In [27]:
list(delivery_df['player_dismissed'].unique())[:2]

[nan, 'Mandeep Singh']

In [28]:

# filling nan values with "0"

delivery_df['player_dismissed'] = delivery_df['player_dismissed'].fillna("0")

# now we will convert this player_dismissed col into a boolean col
# if the player is not dismissed then it's 0 else its 1

delivery_df['player_dismissed'] = delivery_df['player_dismissed'].apply(lambda x:x if x=="0" else "1")

# converting string to int

delivery_df['player_dismissed'] = delivery_df['player_dismissed'].astype('int')


delivery_df['player_dismissed'].unique()


array([0, 1])

In [30]:
# wickets left

wickets = delivery_df.groupby('Match_id')['player_dismissed'].cumsum().values

delivery_df['wickets_left'] = 10-wickets

delivery_df

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,...,Batsman,Non-Striker,Bowler,is_super_over,player_dismissed,Total_y,current_score,runs_left,balls_left,wickets_left
125,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,CH Gayle,Mandeep Singh,A Nehra,0,0,1,1,207,119,10
126,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Mandeep Singh,CH Gayle,A Nehra,0,0,0,1,207,118,10
127,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Mandeep Singh,CH Gayle,A Nehra,0,0,0,1,207,117,10
128,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Mandeep Singh,CH Gayle,A Nehra,0,0,2,3,205,116,10
129,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,Mandeep Singh,CH Gayle,A Nehra,0,0,4,7,201,115,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203850,1024,2016,Bangalore,2016-05-29,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,8,...,Sachin Baby,CJ Jordan,B Kumar,0,0,2,194,15,4,4
203851,1024,2016,Bangalore,2016-05-29,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,8,...,Sachin Baby,CJ Jordan,B Kumar,0,1,0,194,15,3,3
203852,1024,2016,Bangalore,2016-05-29,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,8,...,Iqbal Abdulla,Sachin Baby,B Kumar,0,0,1,195,14,2,3
203853,1024,2016,Bangalore,2016-05-29,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,bat,Sunrisers Hyderabad,8,...,Sachin Baby,Iqbal Abdulla,B Kumar,0,0,1,196,13,1,3


In [31]:
# current run rate
# It is a common practice to express run rates in cricket as runs per over, so the score is multiplied by 6.


delivery_df['cur_run_rate'] = (delivery_df['current_score']*6)/(120-delivery_df['balls_left']) 

# required run rate

delivery_df['req_run_rate'] = (delivery_df['runs_left']*6)/(delivery_df['balls_left'])


delivery_df.head(3)

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,...,Bowler,is_super_over,player_dismissed,Total_y,current_score,runs_left,balls_left,wickets_left,cur_run_rate,req_run_rate
125,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,A Nehra,0,0,1,1,207,119,10,6.0,10.436975
126,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,A Nehra,0,0,0,1,207,118,10,3.0,10.525424
127,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,A Nehra,0,0,0,1,207,117,10,2.0,10.615385


In [32]:
def resultfun(row):
    return 1 if row['Batting_team'] == row['winner'] else 0

In [33]:
delivery_df['result'] = delivery_df.apply(resultfun,axis=1)
delivery_df.head()

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,winner,win_by_runs,...,is_super_over,player_dismissed,Total_y,current_score,runs_left,balls_left,wickets_left,cur_run_rate,req_run_rate,result
125,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,0,1,1,207,119,10,6.0,10.436975,0
126,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,0,0,1,207,118,10,3.0,10.525424,0
127,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,0,0,1,207,117,10,2.0,10.615385,0
128,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,0,2,3,205,116,10,4.5,10.603448,0
129,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Sunrisers Hyderabad,35,...,0,0,4,7,201,115,10,8.4,10.486957,0


In [34]:
# sn.countplot(delivery_df['result'])

In [35]:
final_df = delivery_df[['Batting_team','Bowling_team','city','runs_left',
                        'balls_left','wickets_left','Total_x','cur_run_rate',
                        'req_run_rate','result']]

final_df.head()

Unnamed: 0,Batting_team,Bowling_team,city,runs_left,balls_left,wickets_left,Total_x,cur_run_rate,req_run_rate,result
125,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,119,10,208,6.0,10.436975,0
126,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,118,10,208,3.0,10.525424,0
127,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,117,10,208,2.0,10.615385,0
128,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,205,116,10,208,4.5,10.603448,0
129,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,201,115,10,208,8.4,10.486957,0


In [36]:
final_df.shape

(98313, 10)

In [37]:
final_df.isnull().sum()

Batting_team     0
Bowling_team     0
city             0
runs_left        0
balls_left       0
wickets_left     0
Total_x          0
cur_run_rate     0
req_run_rate    11
result           0
dtype: int64

In [38]:
# dropping of null values


final_df = final_df.dropna()

final_df.isnull().sum()

Batting_team    0
Bowling_team    0
city            0
runs_left       0
balls_left      0
wickets_left    0
Total_x         0
cur_run_rate    0
req_run_rate    0
result          0
dtype: int64

In [39]:
final_df[['runs_left', 'balls_left', 'wickets_left', 'Total_x',
    'cur_run_rate', 'req_run_rate']]

Unnamed: 0,runs_left,balls_left,wickets_left,Total_x,cur_run_rate,req_run_rate
125,207,119,10,208,6.000000,10.436975
126,207,118,10,208,3.000000,10.525424
127,207,117,10,208,2.000000,10.615385
128,205,116,10,208,4.500000,10.603448
129,201,115,10,208,8.400000,10.486957
...,...,...,...,...,...,...
203850,15,4,4,209,10.034483,22.500000
203851,15,3,3,209,9.948718,30.000000
203852,14,2,3,209,9.915254,42.000000
203853,13,1,3,209,9.882353,78.000000


In [40]:
final_df = final_df[final_df['balls_left'] != 0]

In [41]:
data = final_df.copy()
print(data)
test = data['result']
# 
train = data.drop(['result'],axis = 1)

# Replace inf and negative values in 'cur_run_rate' with zeros
train['cur_run_rate'] = train['cur_run_rate'].apply(lambda x: 0 if x <= 0 or np.isinf(x) else x)

# train['Batting_team'].unique()
# train.head()

                       Batting_team         Bowling_team       city  \
125     Royal Challengers Bangalore  Sunrisers Hyderabad  Hyderabad   
126     Royal Challengers Bangalore  Sunrisers Hyderabad  Hyderabad   
127     Royal Challengers Bangalore  Sunrisers Hyderabad  Hyderabad   
128     Royal Challengers Bangalore  Sunrisers Hyderabad  Hyderabad   
129     Royal Challengers Bangalore  Sunrisers Hyderabad  Hyderabad   
...                             ...                  ...        ...   
203849  Royal Challengers Bangalore  Sunrisers Hyderabad  Bangalore   
203850  Royal Challengers Bangalore  Sunrisers Hyderabad  Bangalore   
203851  Royal Challengers Bangalore  Sunrisers Hyderabad  Bangalore   
203852  Royal Challengers Bangalore  Sunrisers Hyderabad  Bangalore   
203853  Royal Challengers Bangalore  Sunrisers Hyderabad  Bangalore   

        runs_left  balls_left  wickets_left  Total_x  cur_run_rate  \
125           207         119            10      208      6.000000   
126    

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Define X (features) and y (target)
X = train
y = test

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)

# Create and train the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=45)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# You can also print a classification report for more detailed performance metrics
print(classification_report(y_test, y_pred))


Accuracy: 0.9560923059477235
              precision    recall  f1-score   support

           0       0.96      0.95      0.95     42113
           1       0.95      0.96      0.96     46072

    accuracy                           0.96     88185
   macro avg       0.96      0.96      0.96     88185
weighted avg       0.96      0.96      0.96     88185


In [44]:
from sklearn.model_selection import train_test_split

# Define X (features) and y (target)
X = train  # Assuming train contains your feature data
y = test   # Assuming test contains your target data

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Perform Train-Validation-Test Split
# First, split the data into training and the rest (combined validation and test)
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.7, random_state=42)

# Then, split the rest into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.7, random_state=42)
depth = [5,10,15,20,50]
res = []
for dpt in depth:
    
    # Create and train the RandomForestClassifier on the training set
    clf = RandomForestClassifier(n_estimators=10, random_state=45, max_depth=dpt)
    clf.fit(X_train, y_train)
    
    # Make predictions on the validation set
    y_val_pred = clf.predict(X_val)
    
    # Evaluate the classifier's performance on the validation set
    validation_accuracy = accuracy_score(y_val, y_val_pred)
    print("Validation Accuracy:", validation_accuracy)
    
    # Make predictions on the test set
    y_test_pred = clf.predict(X_test)
    
    # Evaluate the classifier's performance on the test set
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Test Accuracy:", test_accuracy)

    # You can also print classification reports for more detailed performance metrics
    print("Validation Classification Report:")
    print(classification_report(y_val, y_val_pred))
    
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    rn = [dpt, validation_accuracy, test_accuracy]
    res.append(rn)
res_df = pd.DataFrame.from_records(res)  
print(res_df)


Validation Accuracy: 0.7942262830482115
Test Accuracy: 0.796430133505509
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78      9832
           1       0.79      0.83      0.81     10744

    accuracy                           0.79     20576
   macro avg       0.79      0.79      0.79     20576
weighted avg       0.79      0.79      0.79     20576

Test Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78     22861
           1       0.79      0.83      0.81     25152

    accuracy                           0.80     48013
   macro avg       0.80      0.79      0.80     48013
weighted avg       0.80      0.80      0.80     48013
Validation Accuracy: 0.8325719284603421
Test Accuracy: 0.8338783246204153
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83      

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Define X (features) and y (target)
X = train  # Assuming train contains your feature data
y = test   # Assuming test contains your target data

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Batting_team', 'Bowling_team', 'city']

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
ts_size = [0.2,0.3,0.4,0.5]
qwe= []
for asd in ts_size:    
    # Perform Train-Validation-Test Split
    # First, split the data into training and the rest (combined validation and test)
    X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=asd, random_state=42)
    
    # Then, split the rest into validation and test sets
    X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=asd, random_state=42)
    
    # Create and train the Gaussian Naive Bayes classifier on the training set
    clf_nb = GaussianNB()
    clf_nb.fit(X_train, y_train)
    
    # Make predictions on the validation set for Naive Bayes
    y_val_pred_nb = clf_nb.predict(X_val)
    
    # Evaluate the classifier's performance on the validation set for Naive Bayes
    validation_accuracy_nb = accuracy_score(y_val, y_val_pred_nb)
    
    # Make predictions on the test set for Naive Bayes
    y_test_pred_nb = clf_nb.predict(X_test)
    
    # Evaluate the classifier's performance on the test set for Naive Bayes
    test_accuracy_nb = accuracy_score(y_test, y_test_pred_nb)
    
    # Print validation and test accuracy
    print("Validation Accuracy (Naive Bayes):", validation_accuracy_nb)
    print("Test Accuracy (Naive Bayes):", test_accuracy_nb)
    
    # You can also print classification reports for more detailed performance metrics
    print("Validation Classification Report (Naive Bayes):")
    print(classification_report(y_val, y_val_pred_nb))
    
    print("Test Classification Report (Naive Bayes):")
    print(classification_report(y_test, y_test_pred_nb))
    
    rn = [asd, validation_accuracy_nb, test_accuracy_nb]
    qwe.append(rn)
res_df = pd.DataFrame.from_records(qwe)  
print(res_df)


Validation Accuracy (Naive Bayes): 0.6829112712891497
Test Accuracy (Naive Bayes): 0.676530612244898
Validation Classification Report (Naive Bayes):
              precision    recall  f1-score   support

           0       0.62      0.83      0.71      7398
           1       0.78      0.55      0.65      8279

    accuracy                           0.68     15677
   macro avg       0.70      0.69      0.68     15677
weighted avg       0.71      0.68      0.68     15677

Test Classification Report (Naive Bayes):
              precision    recall  f1-score   support

           0       0.62      0.83      0.71      1890
           1       0.77      0.53      0.63      2030

    accuracy                           0.68      3920
   macro avg       0.70      0.68      0.67      3920
weighted avg       0.70      0.68      0.67      3920
Validation Accuracy (Naive Bayes): 0.6845839813374806
Test Accuracy (Naive Bayes): 0.6830706429300374
Validation Classification Report (Naive Bayes):
      