### Importing the required libraries

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.metrics import mean_absolute_error

### Ignoring warnings and setting proper display options

In [79]:
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.expand_frame_repr',False)
pd.set_option('max_colwidth',None)

### Loading the dataset

In [82]:
df = pd.read_csv('T20I_ball_by_ball_updated.csv')

# checking the first 5 rows of df
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,211048,2004/05,2005-02-17,Eden Park,1,0.1,Australia,New Zealand,AC Gilchrist,MJ Clarke,DR Tuffey,0,1,1.0,,,,,,,,
1,211048,2004/05,2005-02-17,Eden Park,1,0.2,Australia,New Zealand,AC Gilchrist,MJ Clarke,DR Tuffey,0,1,,,,1.0,,,,,
2,211048,2004/05,2005-02-17,Eden Park,1,0.3,Australia,New Zealand,MJ Clarke,AC Gilchrist,DR Tuffey,0,0,,,,,,,,,
3,211048,2004/05,2005-02-17,Eden Park,1,0.4,Australia,New Zealand,MJ Clarke,AC Gilchrist,DR Tuffey,1,0,,,,,,,,,
4,211048,2004/05,2005-02-17,Eden Park,1,0.5,Australia,New Zealand,AC Gilchrist,MJ Clarke,DR Tuffey,1,0,,,,,,,,,


In [84]:
# sorting the dataset by matchid, innings, ball in ascending order and reassigning it back to df
df = df.sort_values(['match_id','innings','ball'],ascending=[True,True,True]).reset_index(drop=True)

### Feature Engineering

In [87]:
# assiging ball column to overs
df['overs'] = df['ball']

# changing the datatype of ball to str
df['ball'] = df['ball'].astype(str)

# extracting the over number part from ball
df['over'] = df['ball'].str[0:-1]

# replacing '.' with '' in over
df['over'] = df['over'].str.replace('.','')

# extracting the ball number from ball
df['ball'] = df['ball'].str[-1:]

# adding a new column for total runs
df['total_runs'] = df['runs_off_bat'] + df['extras']

# keeping only the necessary columns
df = df[['match_id','innings','batting_team','bowling_team','venue','striker','overs','over','ball','total_runs','player_dismissed']]

# renaming the column match_id with id
df = df.rename(columns={'match_id': 'id'})

# creating a list of main teams
main_teams = ['Australia', 'New Zealand', 'England', 'South Africa',
       'West Indies', 'Sri Lanka', 'Pakistan', 'India',
       'Bangladesh']

# filtering df to only have data of the main teams
df = df[(df.batting_team.isin(main_teams)) & (df.bowling_team.isin(main_teams))]

# replacing NaN values with 0
df = df.replace(np.nan,0)

In [89]:
# inspecting df for changes
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0


In [91]:
# adding a new column to aggreagate data to show total runs for every unique id & innings
df['inn_total'] = df.groupby(['id','innings'])['total_runs'].transform('sum')

In [93]:
# inspecting df for changes
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,inn_total
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,179
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,179
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,179
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,179
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,179


In [95]:
# adding a new column to aggreagate data to show cumulative total runs for every unique id & innings
df['current_score'] = df.groupby(['id','innings'])['total_runs'].transform('cumsum')

In [97]:
# inspecting df for changes
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,inn_total,current_score
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,179,0
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,179,1
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,179,1
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,179,1
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,179,1


In [103]:
# checking the unique values of the player_dismissed column
df['player_dismissed'].unique()

array([0, 'GO Jones', 'A Flintoff', ..., 'CS Fraser', 'Nahida Akter',
       'RS Gayakwad'], dtype=object)

In [105]:
# converting the player_dismissed column to a binary column conatining 0s & 1s
df['player_dismissed'] = np.where(df['player_dismissed']==0,0,1)

In [109]:
#adding a new column to aggreagate data to show cumulative total wickets for every unique id & innings
df['current_wickets'] = df.groupby(['id','innings'])['player_dismissed'].transform('cumsum')

In [111]:
# inspecting for changes
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,inn_total,current_score,current_wickets
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,179,0,0
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,179,1,0
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,179,1,0
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,179,1,0
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,179,1,0


In [113]:
# creating a temporary dataset to get rolling total of runs for the last 30 balls
temp = df.groupby(['id','innings'])['total_runs'].rolling(min_periods=1,window=30).sum().reset_index()

In [119]:
temp.head()

Unnamed: 0,id,innings,level_2,total_runs
0,211028,1,0,0.0
1,211028,1,1,1.0
2,211028,1,2,1.0
3,211028,1,3,1.0
4,211028,1,4,1.0


In [123]:
# coverting the total_runs column to a list and assigning it to a new column runs_last30balls
df['runs_last30balls'] = temp['total_runs'].to_list()

In [131]:
# inspecting first 35 rows to check the rolling total of runs for last 30 balls
df.head(35)

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,inn_total,current_score,current_wickets,runs_last30balls
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,179,0,0,0.0
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,179,1,0,1.0
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,179,1,0,1.0
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,179,1,0,1.0
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,179,1,0,1.0
5,211028,1,England,Australia,The Rose Bowl,GO Jones,0.6,0,6,1,0,179,2,0,2.0
6,211028,1,England,Australia,The Rose Bowl,GO Jones,0.7,0,7,2,0,179,4,0,4.0
7,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.1,1,1,0,0,179,4,0,4.0
8,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.2,1,2,0,0,179,4,0,4.0
9,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.3,1,3,1,0,179,5,0,5.0


In [133]:
# modifying the temporary dataset to get rolling total of players dismissed for the last 30 balls
temp = df.groupby(['id','innings'])['player_dismissed'].rolling(min_periods=1,window=30).sum().reset_index()

In [135]:
temp.head()

Unnamed: 0,id,innings,level_2,player_dismissed
0,211028,1,0,0.0
1,211028,1,1,0.0
2,211028,1,2,0.0
3,211028,1,3,0.0
4,211028,1,4,0.0


In [137]:
# coverting the player_dismissed column to a list and assigning it to a new column wickets_last30balls
df['wickets_last30balls'] = temp['player_dismissed'].to_list()

In [139]:
df.head(35)

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,inn_total,current_score,current_wickets,runs_last30balls,wickets_last30balls
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,179,0,0,0.0,0.0
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,179,1,0,1.0,0.0
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,179,1,0,1.0,0.0
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,179,1,0,1.0,0.0
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,179,1,0,1.0,0.0
5,211028,1,England,Australia,The Rose Bowl,GO Jones,0.6,0,6,1,0,179,2,0,2.0,0.0
6,211028,1,England,Australia,The Rose Bowl,GO Jones,0.7,0,7,2,0,179,4,0,4.0,0.0
7,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.1,1,1,0,0,179,4,0,4.0,0.0
8,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.2,1,2,0,0,179,4,0,4.0,0.0
9,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.3,1,3,1,0,179,5,0,5.0,0.0


In [141]:
# assigning the column total runs to a new column named dots_last30balls
df['dots_last30balls'] = df['total_runs']
# converting the dots_last30balls column to a binary column conatining 0s & 1s
df['dots_last30balls'] = np.where(df['dots_last30balls']==0,1,0)

In [143]:
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,inn_total,current_score,current_wickets,runs_last30balls,wickets_last30balls,dots_last30balls
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,179,0,0,0.0,0.0,1
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,179,1,0,1.0,0.0,0
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,179,1,0,1.0,0.0,1
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,179,1,0,1.0,0.0,1
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,179,1,0,1.0,0.0,1


In [145]:
# creating a temporary dataset to get rolling total of dots for the last 30 balls
temp = df.groupby(['id','innings'])['dots_last30balls'].rolling(min_periods=1,window=30).sum().reset_index()

In [147]:
temp.head()

Unnamed: 0,id,innings,level_2,dots_last30balls
0,211028,1,0,1.0
1,211028,1,1,1.0
2,211028,1,2,2.0
3,211028,1,3,3.0
4,211028,1,4,4.0


In [149]:
# coverting the dots_last30balls column to a list and re-assigning it to the same column 
df['dots_last30balls'] = temp['dots_last30balls'].to_list()

In [151]:
# assigning the column total runs to a new column named boundaries_last30balls
df['boundaries_last30balls'] = df['total_runs']
# converting the boundaries_last30balls column to a binary column conatining 0s & 1s
df['boundaries_last30balls'] = np.where(df['boundaries_last30balls']>3,1,0)

In [153]:
# creating a temporary dataset to get rolling total of boundaries for the last 30 balls
temp = df.groupby(['id','innings'])['boundaries_last30balls'].rolling(min_periods=1,window=30).sum().reset_index()

In [155]:
# coverting the boundaries_last30balls column to a list and re-assigning it to the same column 
df['boundaries_last30balls'] = temp['boundaries_last30balls'].to_list()

In [157]:
df.head(35)

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,inn_total,current_score,current_wickets,runs_last30balls,wickets_last30balls,dots_last30balls,boundaries_last30balls
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,179,0,0,0.0,0.0,1.0,0.0
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,179,1,0,1.0,0.0,1.0,0.0
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,179,1,0,1.0,0.0,2.0,0.0
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,179,1,0,1.0,0.0,3.0,0.0
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,179,1,0,1.0,0.0,4.0,0.0
5,211028,1,England,Australia,The Rose Bowl,GO Jones,0.6,0,6,1,0,179,2,0,2.0,0.0,4.0,0.0
6,211028,1,England,Australia,The Rose Bowl,GO Jones,0.7,0,7,2,0,179,4,0,4.0,0.0,4.0,0.0
7,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.1,1,1,0,0,179,4,0,4.0,0.0,5.0,0.0
8,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.2,1,2,0,0,179,4,0,4.0,0.0,6.0,0.0
9,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.3,1,3,1,0,179,5,0,5.0,0.0,6.0,0.0


In [159]:
# filtering df to include only the necessary columns
df = df[['id','innings','batting_team','bowling_team','venue','striker','overs','over','ball','total_runs','player_dismissed','current_score','current_wickets','runs_last30balls','wickets_last30balls','dots_last30balls','boundaries_last30balls','inn_total']]

In [161]:
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,current_score,current_wickets,runs_last30balls,wickets_last30balls,dots_last30balls,boundaries_last30balls,inn_total
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,0,0,0.0,0.0,1.0,0.0,179
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,1,0,1.0,0.0,1.0,0.0,179
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,1,0,1.0,0.0,2.0,0.0,179
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,1,0,1.0,0.0,3.0,0.0,179
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,1,0,1.0,0.0,4.0,0.0,179


In [163]:
# adding a new column to aggreagate data to show cumulative total innings wickets for every unique id & innings
df['inn_wickets'] = df.groupby(['id','innings'])['player_dismissed'].transform('sum')

In [165]:
# Added 2 new features 1) Run rate 2) Run rate in last 30 balls

df['run_rate'] = df.apply(lambda x : x['current_score'] / x['overs'], axis = 1)

df['runrate_last30balls'] = df.apply(lambda x : x['runs_last30balls'] / x['overs'], axis = 1)

In [169]:
df.head(35)

Unnamed: 0,id,innings,batting_team,bowling_team,venue,striker,overs,over,ball,total_runs,player_dismissed,current_score,current_wickets,runs_last30balls,wickets_last30balls,dots_last30balls,boundaries_last30balls,inn_total,inn_wickets,run_rate,runrate_last30balls
0,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,0,0,0.0,0.0,1.0,0.0,179,8,0.0,0.0
1,211028,1,England,Australia,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,1,0,1.0,0.0,1.0,0.0,179,8,5.0,5.0
2,211028,1,England,Australia,The Rose Bowl,GO Jones,0.3,0,3,0,0,1,0,1.0,0.0,2.0,0.0,179,8,3.333333,3.333333
3,211028,1,England,Australia,The Rose Bowl,GO Jones,0.4,0,4,0,0,1,0,1.0,0.0,3.0,0.0,179,8,2.5,2.5
4,211028,1,England,Australia,The Rose Bowl,GO Jones,0.5,0,5,0,0,1,0,1.0,0.0,4.0,0.0,179,8,2.0,2.0
5,211028,1,England,Australia,The Rose Bowl,GO Jones,0.6,0,6,1,0,2,0,2.0,0.0,4.0,0.0,179,8,3.333333,3.333333
6,211028,1,England,Australia,The Rose Bowl,GO Jones,0.7,0,7,2,0,4,0,4.0,0.0,4.0,0.0,179,8,5.714286,5.714286
7,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.1,1,1,0,0,4,0,4.0,0.0,5.0,0.0,179,8,3.636364,3.636364
8,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.2,1,2,0,0,4,0,4.0,0.0,6.0,0.0,179,8,3.333333,3.333333
9,211028,1,England,Australia,The Rose Bowl,ME Trescothick,1.3,1,3,1,0,5,0,5.0,0.0,6.0,0.0,179,8,3.846154,3.846154


In [171]:
# checking the data types of the columns of df
df.dtypes

id                          int64
innings                     int64
batting_team               object
bowling_team               object
venue                      object
striker                    object
overs                     float64
over                       object
ball                       object
total_runs                  int64
player_dismissed            int32
current_score               int64
current_wickets             int32
runs_last30balls          float64
wickets_last30balls       float64
dots_last30balls          float64
boundaries_last30balls    float64
inn_total                   int64
inn_wickets                 int32
run_rate                  float64
runrate_last30balls       float64
dtype: object

In [179]:
# creating a dictionary for converting columns to their desired types
convert_dict = {'runs_last30balls' : int,
                'wickets_last30balls': int,
                'dots_last30balls' : int,
                'boundaries_last30balls' : int
               }
# converting the mentioned columns of df to the format specified above
df = df.astype(convert_dict)

In [181]:
df.dtypes

id                          int64
innings                     int64
batting_team               object
bowling_team               object
venue                      object
striker                    object
overs                     float64
over                       object
ball                       object
total_runs                  int64
player_dismissed            int32
current_score               int64
current_wickets             int32
runs_last30balls            int32
wickets_last30balls         int32
dots_last30balls            int32
boundaries_last30balls      int32
inn_total                   int64
inn_wickets                 int32
run_rate                  float64
runrate_last30balls       float64
dtype: object

In [183]:
# one-hot encoding the batting_team and bowling_team columns
df = pd.get_dummies(data = df, columns = ['batting_team', 'bowling_team'])

In [187]:
# inspecting df for changes
df.head()

Unnamed: 0,id,innings,venue,striker,overs,over,ball,total_runs,player_dismissed,current_score,current_wickets,runs_last30balls,wickets_last30balls,dots_last30balls,boundaries_last30balls,inn_total,inn_wickets,run_rate,runrate_last30balls,batting_team_Australia,batting_team_Bangladesh,batting_team_England,batting_team_India,batting_team_New Zealand,batting_team_Pakistan,batting_team_South Africa,batting_team_Sri Lanka,batting_team_West Indies,bowling_team_Australia,bowling_team_Bangladesh,bowling_team_England,bowling_team_India,bowling_team_New Zealand,bowling_team_Pakistan,bowling_team_South Africa,bowling_team_Sri Lanka,bowling_team_West Indies
0,211028,1,The Rose Bowl,ME Trescothick,0.1,0,1,0,0,0,0,0,0,1,0,179,8,0.0,0.0,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
1,211028,1,The Rose Bowl,ME Trescothick,0.2,0,2,1,0,1,0,1,0,1,0,179,8,5.0,5.0,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
2,211028,1,The Rose Bowl,GO Jones,0.3,0,3,0,0,1,0,1,0,2,0,179,8,3.333333,3.333333,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
3,211028,1,The Rose Bowl,GO Jones,0.4,0,4,0,0,1,0,1,0,3,0,179,8,2.5,2.5,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
4,211028,1,The Rose Bowl,GO Jones,0.5,0,5,0,0,1,0,1,0,4,0,179,8,2.0,2.0,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False


In [185]:
# checking the columns of df
df.columns

Index(['id', 'innings', 'venue', 'striker', 'overs', 'over', 'ball',
       'total_runs', 'player_dismissed', 'current_score', 'current_wickets',
       'runs_last30balls', 'wickets_last30balls', 'dots_last30balls',
       'boundaries_last30balls', 'inn_total', 'inn_wickets', 'run_rate',
       'runrate_last30balls', 'batting_team_Australia',
       'batting_team_Bangladesh', 'batting_team_England', 'batting_team_India',
       'batting_team_New Zealand', 'batting_team_Pakistan',
       'batting_team_South Africa', 'batting_team_Sri Lanka',
       'batting_team_West Indies', 'bowling_team_Australia',
       'bowling_team_Bangladesh', 'bowling_team_England', 'bowling_team_India',
       'bowling_team_New Zealand', 'bowling_team_Pakistan',
       'bowling_team_South Africa', 'bowling_team_Sri Lanka',
       'bowling_team_West Indies'],
      dtype='object')

In [189]:
# filtering df to include the listed columns
df = df[['id','batting_team_Australia',
       'batting_team_Bangladesh', 'batting_team_England', 'batting_team_India',
       'batting_team_New Zealand', 'batting_team_Pakistan',
       'batting_team_South Africa', 'batting_team_Sri Lanka',
       'batting_team_West Indies', 'bowling_team_Australia',
       'bowling_team_Bangladesh', 'bowling_team_England', 'bowling_team_India',
       'bowling_team_New Zealand', 'bowling_team_Pakistan',
       'bowling_team_South Africa', 'bowling_team_Sri Lanka',
       'bowling_team_West Indies', 'overs', 'current_score', 'current_wickets',
       'runs_last30balls', 'wickets_last30balls', 'dots_last30balls',
       'boundaries_last30balls', 'inn_total', 'run_rate', 'runrate_last30balls']]

In [193]:
df.head()

Unnamed: 0,id,batting_team_Australia,batting_team_Bangladesh,batting_team_England,batting_team_India,batting_team_New Zealand,batting_team_Pakistan,batting_team_South Africa,batting_team_Sri Lanka,batting_team_West Indies,bowling_team_Australia,bowling_team_Bangladesh,bowling_team_England,bowling_team_India,bowling_team_New Zealand,bowling_team_Pakistan,bowling_team_South Africa,bowling_team_Sri Lanka,bowling_team_West Indies,overs,current_score,current_wickets,runs_last30balls,wickets_last30balls,dots_last30balls,boundaries_last30balls,inn_total,run_rate,runrate_last30balls
0,211028,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.1,0,0,0,0,1,0,179,0.0,0.0
1,211028,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.2,1,0,1,0,1,0,179,5.0,5.0
2,211028,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.3,1,0,1,0,2,0,179,3.333333,3.333333
3,211028,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.4,1,0,1,0,3,0,179,2.5,2.5
4,211028,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.5,1,0,1,0,4,0,179,2.0,2.0


In [205]:
### Performing Train Test Split

In [195]:
# extracting the independent variables into X by dropping the dependent variable
X = df.drop(labels = ['inn_total', 'id'], axis=1)
# extracting the values of the dependednt (target) variable
y = df['inn_total'].values

In [197]:
y

array([179, 179, 179, ...,  71,  71,  71], dtype=int64)

In [199]:
X.head()

Unnamed: 0,batting_team_Australia,batting_team_Bangladesh,batting_team_England,batting_team_India,batting_team_New Zealand,batting_team_Pakistan,batting_team_South Africa,batting_team_Sri Lanka,batting_team_West Indies,bowling_team_Australia,bowling_team_Bangladesh,bowling_team_England,bowling_team_India,bowling_team_New Zealand,bowling_team_Pakistan,bowling_team_South Africa,bowling_team_Sri Lanka,bowling_team_West Indies,overs,current_score,current_wickets,runs_last30balls,wickets_last30balls,dots_last30balls,boundaries_last30balls,run_rate,runrate_last30balls
0,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.1,0,0,0,0,1,0,0.0,0.0
1,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.2,1,0,1,0,1,0,5.0,5.0
2,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.3,1,0,1,0,2,0,3.333333,3.333333
3,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.4,1,0,1,0,3,0,2.5,2.5
4,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,0.5,1,0,1,0,4,0,2.0,2.0


In [201]:
len(X)

239117

In [203]:
len(y)

239117

In [207]:
# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [209]:
X_train = X_train.values # Converts the X_train DataFrame to a NumPy array.
X_test = X_test.values   # Converts the X_test DataFrame to a NumPy array.
X_train = np.asarray(X_train).astype(np.float32) # Ensures X_train is explicitly converted to a NumPy array of type float32.
X_test = np.asarray(X_test).astype(np.float32)   # Ensures X_test is explicitly converted to a NumPy array of type float32.

In [211]:
# printing the shape of the split and converted data sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(179337, 27) (59780, 27) (179337,) (59780,)


In [53]:
# Training the Linear Regression Model

In [55]:
# Initialize a LinearRegression model from scikit-learn.
LR_model = LinearRegression()
# Trains (fits) the LinearRegression model on the training data (X_train and y_train).
LR_model.fit(X_train, y_train)

In [213]:
# Defining the filename for saving the Linear Regression model. The file will be named 'lr-model.pkl'.
filename = 'lr-model.pkl'
# Saves (serializes) the trained Linear Regression model to the specified file using pickle in write-binary mode ('wb').
pickle.dump(LR_model, open(filename, 'wb'))

In [215]:
# Generate predictions for the test data (X_test) using the trained Linear Regression model.
prediction = LR_model.predict(X_test)
# Calculating the Mean Absolute Error (MAE) between the true labels (y_test) and the predicted values.
mean_absolute_error(y_test, prediction)

18.757883198120076

In [217]:
# Defining a function to predict innings scores

def score_prediction(Bat_Team, Bowl_Team, overs, total_score, total_wickets, prev_30_runs, prev_30_wickets, prev_30_dots, prev_30_boundaries, RR, P30RR):

    temp_array = list()

    if Bat_Team == 'AUS':
        temp_array = temp_array + [1, 0, 0, 0, 0, 0, 0, 0, 0]
    elif Bat_Team == 'BAN':
        temp_array = temp_array + [0, 1, 0, 0, 0, 0, 0, 0, 0]
    elif Bat_Team == 'ENG':
        temp_array = temp_array + [0, 0, 1, 0, 0, 0, 0, 0, 0]
    elif Bat_Team == 'IND':
        temp_array = temp_array + [0, 0, 0, 1, 0, 0, 0, 0, 0]
    elif Bat_Team == 'NZ':
        temp_array = temp_array + [0, 0, 0, 0, 1, 0, 0, 0, 0]
    elif Bat_Team == 'PAK':
        temp_array = temp_array + [0, 0, 0, 0, 0, 1, 0, 0, 0]
    elif Bat_Team == 'SA':
        temp_array = temp_array + [0, 0, 0, 0, 0, 0, 1, 0, 0]
    elif Bat_Team == 'SL':
        temp_array = temp_array + [0, 0, 0, 0, 0, 0, 0, 1, 0]
    elif Bat_Team == 'WI':
        temp_array = temp_array + [0, 0, 0, 0, 0, 0, 0, 0, 1]

    if Bowl_Team == 'AUS':
        temp_array = temp_array + [1, 0, 0, 0, 0, 0, 0, 0, 0]
    elif Bowl_Team == 'BAN':
        temp_array = temp_array + [0, 1, 0, 0, 0, 0, 0, 0, 0]
    elif Bowl_Team == 'ENG':
        temp_array = temp_array + [0, 0, 1, 0, 0, 0, 0, 0, 0]
    elif Bowl_Team == 'IND':
        temp_array = temp_array + [0, 0, 0, 1, 0, 0, 0, 0, 0]
    elif Bowl_Team == 'NZ':
        temp_array = temp_array + [0, 0, 0, 0, 1, 0, 0, 0, 0]
    elif Bowl_Team == 'PAK':
        temp_array = temp_array + [0, 0, 0, 0, 0, 1, 0, 0, 0]
    elif Bat_Team == 'SA':
        temp_array = temp_array + [0, 0, 0, 0, 0, 0, 1, 0, 0]
    elif Bowl_Team == 'SL':
        temp_array = temp_array + [0, 0, 0, 0, 0, 0, 0, 1, 0]
    elif Bowl_Team == 'WI':
        temp_array = temp_array + [0, 0, 0, 0, 0, 0, 0, 0, 1]

    temp_array = temp_array + [overs, current_score, current_wickets, runs_last30balls, wickets_last30balls, dots_last30balls,
       boundaries_last30balls, run_rate, runrate_last30balls]
    data = np.array([temp_array])
    my_prediction = int(LR_model.predict(data)) # passing the prepared data to predict the innings score

    print('Predicted score :', my_prediction)
    print('Predicted score range :', my_prediction - 19, 'to ', my_prediction + 19)

In [219]:
# defining variables for a test scenario
Bat_team = 'AUS'
Bowl_team = 'IND'
overs = 10.3               # current overs
current_score = 67         # current score
current_wickets = 2        # current_wickets
runs_last30balls = 37      # current_runs_in_last_30_balls
wickets_last30balls = 1    # current_wickets_in_last_30_balls
dots_last30balls = 8       # current_dots_in_last_30_balls
boundaries_last30balls = 5 # current_boundaries_in_last_30_balls
run_rate = 6.38            # current run rate
runrate_last30balls = 7.4  # run_rate in last 30 balls

# calling the score-prediction function using the variables defined above
score_prediction(Bat_team, Bowl_team, overs, current_score, current_wickets, runs_last30balls, wickets_last30balls, dots_last30balls,
       boundaries_last30balls, run_rate, runrate_last30balls)



Predicted score : 136
Predicted score range : 117 to  155


In [None]:
# Thus IND will score 136 by the end of 20 overs and the predicted score range would be between 117 to 155.