In [389]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


In [390]:
matches = pd.read_csv('matches.csv')
deliveries = pd.read_csv('deliveries.csv')

In [391]:
matches.head()

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,IPL-2017,Indore,08-04-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,


In [392]:
deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,4,0,4,,,
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,2,2,,,


In [393]:
matches.shape
deliveries.shape

(179078, 21)

In [394]:
total_scoredf = deliveries.groupby(['match_id', 'inning']).sum()['total_runs'].reset_index()

In [395]:
total_scoredf.head()

Unnamed: 0,match_id,inning,total_runs
0,1,1,207
1,1,2,172
2,2,1,184
3,2,2,187
4,3,1,183


In [396]:
total_scoredf = total_scoredf[total_scoredf['inning'] == 1]
total_scoredf['total_runs'] = total_scoredf['total_runs'] + 1


In [397]:
total_scoredf.head()

Unnamed: 0,match_id,inning,total_runs
0,1,1,208
2,2,1,185
4,3,1,184
6,4,1,164
8,5,1,158


In [398]:
matches = matches.merge(total_scoredf[['match_id','total_runs']], left_on='id', right_on='match_id')


In [399]:
matches.head()

Unnamed: 0,id,Season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3,match_id,total_runs
0,1,IPL-2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,,1,208
1,2,IPL-2017,Pune,06-04-2017,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,,2,185
2,3,IPL-2017,Rajkot,07-04-2017,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,,3,184
3,4,IPL-2017,Indore,08-04-2017,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,,4,164
4,5,IPL-2017,Bangalore,08-04-2017,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,,5,158


In [400]:
matches['team1'].unique()

array(['Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI Punjab',
       'Chennai Super Kings', 'Rajasthan Royals', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants',
       'Delhi Capitals'], dtype=object)

In [401]:
teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals', 'Mumbai Indians',
        'Kings XI Punjab', 'Royal Challengers Bangalore','Sunrisers Hyderabad', 'Delhi Capitals']

In [402]:
matches['team1'] = matches['team1'].str.replace('Delhi Daredevils', 'Delhi Capitals')
matches['team2'] = matches['team2'].str.replace('Delhi Daredevils', 'Delhi Capitals')

In [403]:
matches['team1'] = matches['team1'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')
matches['team2'] = matches['team2'].str.replace('Deccan Chargers', 'Sunrisers Hyderabad')

In [404]:
matches = matches[matches['team1'].isin(teams)]
matches = matches[matches['team2'].isin(teams)]

In [405]:
matches.shape

(641, 20)

In [406]:
matches = matches[matches['dl_applied'] == 0]

In [407]:
matches = matches[['match_id', 'total_runs', 'city', 'winner']]


In [408]:
deliveries_df = matches.merge(deliveries, on='match_id')
deliveries_df

Unnamed: 0,match_id,total_runs_x,city,winner,inning,batting_team,bowling_team,over,ball,batsman,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder
0,1,208,Hyderabad,Sunrisers Hyderabad,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,...,0,0,0,0,0,0,0,,,
1,1,208,Hyderabad,Sunrisers Hyderabad,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,...,0,0,0,0,0,0,0,,,
2,1,208,Hyderabad,Sunrisers Hyderabad,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,...,0,0,0,0,4,0,4,,,
3,1,208,Hyderabad,Sunrisers Hyderabad,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,DA Warner,...,0,0,0,0,0,0,0,,,
4,1,208,Hyderabad,Sunrisers Hyderabad,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,DA Warner,...,0,0,0,0,0,2,2,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149573,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,2,RA Jadeja,...,0,0,0,0,1,0,1,,,
149574,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,3,SR Watson,...,0,0,0,0,2,0,2,,,
149575,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,4,SR Watson,...,0,0,0,0,1,0,1,SR Watson,run out,KH Pandya
149576,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,5,SN Thakur,...,0,0,0,0,2,0,2,,,


In [409]:
deliveries_df = deliveries_df[deliveries_df['inning'] == 2]

In [410]:
deliveries_df.shape

(72413, 24)

In [411]:
deliveries_df['total_runs_y'] = pd.to_numeric(deliveries_df['total_runs_y'], errors='coerce')
deliveries_df['current_score']= deliveries_df.groupby('match_id')['total_runs_y'].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['total_runs_y'] = pd.to_numeric(deliveries_df['total_runs_y'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['current_score']= deliveries_df.groupby('match_id')['total_runs_y'].cumsum()


In [412]:
deliveries_df['runs_left'] = deliveries_df['total_runs_x'] - deliveries_df['current_score'] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['runs_left'] = deliveries_df['total_runs_x'] - deliveries_df['current_score']


In [413]:
deliveries_df

Unnamed: 0,match_id,total_runs_x,city,winner,inning,batting_team,bowling_team,over,ball,batsman,...,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left
125,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,0,0,1,0,1,,,,1,207
126,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,0,0,0,0,0,,,,1,207
127,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,0,0,0,0,0,,,,1,207
128,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,...,0,0,2,0,2,,,,3,205
129,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,...,0,0,4,0,4,,,,7,201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149573,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,2,RA Jadeja,...,0,0,1,0,1,,,,152,1
149574,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,3,SR Watson,...,0,0,2,0,2,,,,154,-1
149575,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,4,SR Watson,...,0,0,1,0,1,SR Watson,run out,KH Pandya,155,-2
149576,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,5,SN Thakur,...,0,0,2,0,2,,,,157,-4


In [414]:
deliveries_df['balls_left'] = 126 - (deliveries_df['over']*6 + deliveries_df['ball'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['balls_left'] = 126 - (deliveries_df['over']*6 + deliveries_df['ball'])


In [415]:
deliveries_df

Unnamed: 0,match_id,total_runs_x,city,winner,inning,batting_team,bowling_team,over,ball,batsman,...,penalty_runs,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left
125,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,0,1,0,1,,,,1,207,119
126,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,0,0,0,0,,,,1,207,118
127,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,0,0,0,0,,,,1,207,117
128,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,...,0,2,0,2,,,,3,205,116
129,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,...,0,4,0,4,,,,7,201,115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149573,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,2,RA Jadeja,...,0,1,0,1,,,,152,1,4
149574,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,3,SR Watson,...,0,2,0,2,,,,154,-1,3
149575,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,4,SR Watson,...,0,1,0,1,SR Watson,run out,KH Pandya,155,-2,2
149576,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,5,SN Thakur,...,0,2,0,2,,,,157,-4,1


In [416]:
deliveries_df['player_dismissed'] = deliveries_df['player_dismissed'].fillna("0")
deliveries_df['player_dismissed'] = deliveries_df['player_dismissed'].apply(lambda x: x if x == "0" else "1")
deliveries_df['player_dismissed'] = deliveries_df['player_dismissed'].astype(int)
wickets = deliveries_df.groupby('match_id')['player_dismissed'].cumsum()
deliveries_df['wickets'] = 10 - wickets
deliveries_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['player_dismissed'] = deliveries_df['player_dismissed'].fillna("0")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['player_dismissed'] = deliveries_df['player_dismissed'].apply(lambda x: x if x == "0" else "1")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df

Unnamed: 0,match_id,total_runs_x,city,winner,inning,batting_team,bowling_team,over,ball,batsman,...,batsman_runs,extra_runs,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left,wickets
125,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,1,0,1,0,,,1,207,119,10
126,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,0,0,0,0,,,1,207,118,10
127,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,0,0,0,0,,,1,207,117,10
128,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,...,2,0,2,0,,,3,205,116,10
129,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,...,4,0,4,0,,,7,201,115,10


In [417]:
deliveries_df['crr']=deliveries_df['current_score']*6/(120-deliveries_df['balls_left'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['crr']=deliveries_df['current_score']*6/(120-deliveries_df['balls_left'])


In [418]:
deliveries_df['rrr'] = deliveries_df['runs_left']*6/deliveries_df['balls_left']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['rrr'] = deliveries_df['runs_left']*6/deliveries_df['balls_left']


In [419]:
deliveries_df

Unnamed: 0,match_id,total_runs_x,city,winner,inning,batting_team,bowling_team,over,ball,batsman,...,total_runs_y,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left,wickets,crr,rrr
125,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,1,0,,,1,207,119,10,6.000000,10.436975
126,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,0,0,,,1,207,118,10,3.000000,10.525424
127,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,0,0,,,1,207,117,10,2.000000,10.615385
128,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,...,2,0,,,3,205,116,10,4.500000,10.603448
129,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,...,4,0,,,7,201,115,10,8.400000,10.486957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149573,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,2,RA Jadeja,...,1,0,,,152,1,4,5,7.862069,1.500000
149574,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,3,SR Watson,...,2,0,,,154,-1,3,5,7.897436,-2.000000
149575,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,4,SR Watson,...,1,1,run out,KH Pandya,155,-2,2,4,7.881356,-6.000000
149576,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,5,SN Thakur,...,2,0,,,157,-4,1,4,7.915966,-24.000000


In [420]:
def result(row):
    return 1 if row['batting_team'] == row['winner'] else 0

In [421]:
deliveries_df['result']=deliveries_df.apply(result, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deliveries_df['result']=deliveries_df.apply(result, axis=1)


In [422]:
deliveries_df

Unnamed: 0,match_id,total_runs_x,city,winner,inning,batting_team,bowling_team,over,ball,batsman,...,player_dismissed,dismissal_kind,fielder,current_score,runs_left,balls_left,wickets,crr,rrr,result
125,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,1,CH Gayle,...,0,,,1,207,119,10,6.000000,10.436975,0
126,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,2,Mandeep Singh,...,0,,,1,207,118,10,3.000000,10.525424,0
127,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,3,Mandeep Singh,...,0,,,1,207,117,10,2.000000,10.615385,0
128,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,4,Mandeep Singh,...,0,,,3,205,116,10,4.500000,10.603448,0
129,1,208,Hyderabad,Sunrisers Hyderabad,2,Royal Challengers Bangalore,Sunrisers Hyderabad,1,5,Mandeep Singh,...,0,,,7,201,115,10,8.400000,10.486957,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149573,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,2,RA Jadeja,...,0,,,152,1,4,5,7.862069,1.500000,0
149574,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,3,SR Watson,...,0,,,154,-1,3,5,7.897436,-2.000000,0
149575,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,4,SR Watson,...,1,run out,KH Pandya,155,-2,2,4,7.881356,-6.000000,0
149576,11415,153,Hyderabad,Mumbai Indians,2,Chennai Super Kings,Mumbai Indians,20,5,SN Thakur,...,0,,,157,-4,1,4,7.915966,-24.000000,0


In [423]:
final_df = deliveries_df[['batting_team','bowling_team','city','runs_left','balls_left','wickets','total_runs_x','crr','rrr','result']]

In [424]:
final_df.head()

Unnamed: 0,batting_team,bowling_team,city,runs_left,balls_left,wickets,total_runs_x,crr,rrr,result
125,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,119,10,208,6.0,10.436975,0
126,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,118,10,208,3.0,10.525424,0
127,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,207,117,10,208,2.0,10.615385,0
128,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,205,116,10,208,4.5,10.603448,0
129,Royal Challengers Bangalore,Sunrisers Hyderabad,Hyderabad,201,115,10,208,8.4,10.486957,0


In [425]:
final_df = final_df.sample(final_df.shape[0])

In [426]:
final_df.sample()
final_df.dropna(inplace=True)
final_df = final_df[final_df['balls_left'] != 0]

In [427]:
X = final_df.iloc[:,:-1]
y = final_df.iloc[:,-1]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [428]:
X_train

Unnamed: 0,batting_team,bowling_team,city,runs_left,balls_left,wickets,total_runs_x,crr,rrr
26917,Mumbai Indians,Deccan Chargers,Centurion,96,78,8,146,7.142857,7.384615
26723,Delhi Daredevils,Kolkata Knight Riders,Durban,44,33,9,155,7.655172,8.000000
22348,Mumbai Indians,Deccan Chargers,Durban,45,21,5,169,7.515152,12.857143
73168,Chennai Super Kings,Mumbai Indians,Chennai,113,85,8,149,6.171429,7.976471
55187,Mumbai Indians,Kings XI Punjab,Chandigarh,151,113,10,164,11.142857,8.017699
...,...,...,...,...,...,...,...,...,...
43718,Royal Challengers Bangalore,Kolkata Knight Riders,Bangalore,103,75,9,161,7.733333,8.240000
46644,Delhi Daredevils,Deccan Chargers,Delhi,13,1,3,146,6.705882,78.000000
139886,Sunrisers Hyderabad,Mumbai Indians,Hyderabad,50,23,3,143,5.752577,13.043478
131130,Rajasthan Royals,Chennai Super Kings,Jaipur,99,60,8,189,9.000000,9.900000


In [429]:


trf = ColumnTransformer(
    [('trf', OneHotEncoder(sparse_output=False, drop='first'), ['batting_team', 'bowling_team', 'city'])],
    remainder='passthrough'
)


In [430]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',LogisticRegression(solver='liblinear'))
])