################################################################################

**Author**: _Pradip Kumar Das_

**License:** https://github.com/PradipKumarDas/Competitions/blob/main/LICENSE

**Profile & Contact:** [LinkedIn](https://www.linkedin.com/in/daspradipkumar/) | [GitHub](https://github.com/PradipKumarDas) | [Kaggle](https://www.kaggle.com/pradipkumardas) | pradipkumardas@hotmail.com (Email)

################################################################################

# IPL 2021 Match Score Prediction Contest Organized by IIT Madras Online B.Sc. Programme Team

## A Common Sense based Non-Machine Learning Model

In [1]:
# Imports required packages

import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# Sets Pandas option to show all columns

pd.set_option('display.max_columns', None)

In [6]:
# Downloads the dataset from cricsheet.org/downloads

!wget https://cricsheet.org/downloads/ipl_csv2.zip -P Data

# Unzips the data

!unzip Data/ipl_csv2.zip -d Data

--2021-04-21 10:33:20--  https://cricsheet.org/downloads/ipl_csv2.zip
Resolving cricsheet.org (cricsheet.org)... 64.90.49.16
Connecting to cricsheet.org (cricsheet.org)|64.90.49.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3286876 (3.1M) [application/zip]
Saving to: ‘Data/ipl_csv2.zip’


2021-04-21 10:33:30 (392 KB/s) - ‘Data/ipl_csv2.zip’ saved [3286876/3286876]

Archive:  Data/ipl_csv2.zip
  inflating: Data/README.txt         
  inflating: Data/all_matches.csv    
  inflating: Data/335982.csv         
  inflating: Data/335983.csv         
  inflating: Data/335984.csv         
  inflating: Data/335986.csv         
  inflating: Data/335985.csv         
  inflating: Data/335987.csv         
  inflating: Data/335988.csv         
  inflating: Data/335989.csv         
  inflating: Data/335990.csv         
  inflating: Data/335991.csv         
  inflating: Data/335993.csv         
  inflating: Data/335992.csv         
  inflating: Data/335994.csv         
  i

In [7]:
# Load data for all matches

data = pd.read_csv("Data/all_matches.csv")

In [8]:
# Checks top few rows of the data

data.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,1,1.0,,,,,,,,
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,


In [9]:
# Checks bottom rows of the data

data.tail()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
196795,1254070,2021,2021-04-20,"MA Chidambaram Stadium, Chepauk, Chennai",2,18.6,Delhi Capitals,Mumbai Indians,Lalit Yadav,SO Hetmyer,JJ Bumrah,1,0,,,,,,,,,
196796,1254070,2021,2021-04-20,"MA Chidambaram Stadium, Chepauk, Chennai",2,18.7,Delhi Capitals,Mumbai Indians,SO Hetmyer,Lalit Yadav,JJ Bumrah,2,0,,,,,,,,,
196797,1254070,2021,2021-04-20,"MA Chidambaram Stadium, Chepauk, Chennai",2,18.8,Delhi Capitals,Mumbai Indians,SO Hetmyer,Lalit Yadav,JJ Bumrah,1,0,,,,,,,,,
196798,1254070,2021,2021-04-20,"MA Chidambaram Stadium, Chepauk, Chennai",2,19.1,Delhi Capitals,Mumbai Indians,SO Hetmyer,Lalit Yadav,KA Pollard,4,0,,,,,,,,,
196799,1254070,2021,2021-04-20,"MA Chidambaram Stadium, Chepauk, Chennai",2,19.2,Delhi Capitals,Mumbai Indians,SO Hetmyer,Lalit Yadav,KA Pollard,0,1,,1.0,,,,,,,


In [10]:
# Checks for missing values

data.isna().sum()

match_id                       0
season                         0
start_date                     0
venue                          0
innings                        0
ball                           0
batting_team                   0
bowling_team                   0
striker                        0
non_striker                    0
bowler                         0
runs_off_bat                   0
extras                         0
wides                     190846
noballs                   196005
byes                      196285
legbyes                   193650
penalty                   196798
wicket_type               187100
player_dismissed          187100
other_wicket_type         196800
other_player_dismissed    196800
dtype: int64

In [11]:
# Inserts a new calculated column called "score_off_ball" which is a sum of values in 
# columns "runs_off_bat" and "extras" just after column "extras" to indicate contributing score off the ball
# to make calculating total score at the end of the match easy

data.insert(loc=13, column="score_off_ball", value=data.runs_off_bat + data.extras)

#### Checks for venues for duplicates with slightly different names, if any, and updates the rows with the same venue names accordingly

In [19]:
data.venue.value_counts().sort_index()

Arun Jaitley Stadium                                    3356
Barabati Stadium                                        1695
Brabourne Stadium                                       2719
Buffalo Park                                             715
De Beers Diamond Oval                                    726
Dr DY Patil Sports Academy                              3993
Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium     3037
Dubai International Cricket Stadium                     8080
Eden Gardens                                           17988
Feroz Shah Kotla                                       13950
Green Park                                               921
Himachal Pradesh Cricket Association Stadium            2159
Holkar Cricket Stadium                                  1965
IS Bindra Stadium                                      13269
JSCA International Stadium Complex                      1671
Kingsmead                                               3643
M. A. Chidambaram Stadiu

In [16]:
# Updates these venues that are mentioned in different names with same name

data.venue[data.venue.str.contains("Brabourne", 
                                                 case=False)] = "Brabourne Stadium"
data.venue[data.venue.str.contains("Chinnaswamy", 
                                                 case=False)] = "M. Chinnaswamy Stadium"
data.venue[data.venue.str.contains("Chidambaram", 
                                                 case=False)] = "M. A. Chidambaram Stadium"
data.venue[data.venue.str.contains(r'Punjab Cricket|IS Bindra|Inderjit Singh Bindra', 
                                                 case=False)] = "IS Bindra Stadium"
data.venue[data.venue.str.contains("Rajiv Gandhi", 
                                                 case=False)] = "Rajiv Gandhi International Cricket Stadium"
data.venue[data.venue.str.contains("Wankhede", 
                                                 case=False)] = "Wankhede Stadium"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the c

#### Checks for teams for duplicates with slightly different names, if any, and updates the rows with the same team names accordingly

In [20]:
data.batting_team.append(data.bowling_team).value_counts().sort_index()

Chennai Super Kings            43405
Deccan Chargers                18073
Delhi Capitals                  8998
Delhi Daredevils               37511
Gujarat Lions                   7111
Kings XI Punjab                45129
Kochi Tuskers Kerala            3196
Kolkata Knight Riders          45923
Mumbai Indians                 49956
Pune Warriors                  10900
Punjab Kings                     710
Rajasthan Royals               38842
Rising Pune Supergiant          3828
Rising Pune Supergiants         3195
Royal Challengers Bangalore    46521
Sunrisers Hyderabad            30302
dtype: int64

In [21]:
# Updates team name from "Delhi Daredevils" with the new name "Delhi Capitals"

data.batting_team[data.batting_team.str.contains("Delhi Daredevils", case=False)] = "Delhi Capitals"
data.bowling_team[data.bowling_team.str.contains("Delhi Daredevils", case=False)] = "Delhi Capitals"

# Updates team name from "Kings XI Punjab" with the new name "Punjab Kings"

data.batting_team[data.batting_team.str.contains("Kings XI Punjab", case=False)] = "Punjab Kings"
data.bowling_team[data.bowling_team.str.contains("Kings XI Punjab", case=False)] = "Punjab Kings"

# Updates appropriate team name for "Rising Pune Supergiant"

data.batting_team[data.batting_team.str.contains("Rising Pune Supergiants", case=False)] = "Rising Pune Supergiant"
data.bowling_team[data.bowling_team.str.contains("Rising Pune Supergiants", case=False)] = "Rising Pune Supergiant"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to b

## Let's first build a common sense based non-machine learning model as baselined model. 

In [50]:
# First, lets have small dataset for that

data_non_ml = data[data.ball <= 5.6]

data_non_ml = data_non_ml[["match_id", 
                           "venue", 
                           "innings",
                           "batting_team", 
                           "bowling_team", 
                           "score_off_ball"]]

In [51]:
# Resets its index

data_non_ml.reset_index()

Unnamed: 0,index,match_id,venue,innings,batting_team,bowling_team,score_off_ball
0,0,335982,M. Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,1
1,1,335982,M. Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,0
2,2,335982,M. Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,1
3,3,335982,M. Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,0
4,4,335982,M. Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,0
...,...,...,...,...,...,...,...
61715,196711,1254070,M. A. Chidambaram Stadium,2,Delhi Capitals,Mumbai Indians,1
61716,196712,1254070,M. A. Chidambaram Stadium,2,Delhi Capitals,Mumbai Indians,1
61717,196713,1254070,M. A. Chidambaram Stadium,2,Delhi Capitals,Mumbai Indians,0
61718,196714,1254070,M. A. Chidambaram Stadium,2,Delhi Capitals,Mumbai Indians,1


In [52]:
# Calculates the match wise total score after end of 6 overs

data_non_ml = data_non_ml.groupby(
    ["match_id", "venue", "innings", "batting_team", "bowling_team"]).score_off_ball.sum()

In [53]:
# Checks how the scores look

data_non_ml

match_id  venue                      innings  batting_team                 bowling_team               
335982    M. Chinnaswamy Stadium     1        Kolkata Knight Riders        Royal Challengers Bangalore    61
                                     2        Royal Challengers Bangalore  Kolkata Knight Riders          26
335983    IS Bindra Stadium          1        Chennai Super Kings          Punjab Kings                   53
                                     2        Punjab Kings                 Chennai Super Kings            56
335984    Feroz Shah Kotla           1        Rajasthan Royals             Delhi Capitals                 40
                                                                                                          ..
1254068   Wankhede Stadium           2        Delhi Capitals               Punjab Kings                   62
1254069   Wankhede Stadium           1        Chennai Super Kings          Rajasthan Royals               46
                         

In [54]:
# Resets the multi-index created due to grouping to get tabular data

data_non_ml = data_non_ml.reset_index()

In [55]:
# Checks how to data looks

data_non_ml

Unnamed: 0,match_id,venue,innings,batting_team,bowling_team,score_off_ball
0,335982,M. Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,61
1,335982,M. Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,26
2,335983,IS Bindra Stadium,1,Chennai Super Kings,Punjab Kings,53
3,335983,IS Bindra Stadium,2,Punjab Kings,Chennai Super Kings,56
4,335984,Feroz Shah Kotla,1,Rajasthan Royals,Delhi Capitals,40
...,...,...,...,...,...,...
1679,1254068,Wankhede Stadium,2,Delhi Capitals,Punjab Kings,62
1680,1254069,Wankhede Stadium,1,Chennai Super Kings,Rajasthan Royals,46
1681,1254069,Wankhede Stadium,2,Rajasthan Royals,Chennai Super Kings,45
1682,1254070,M. A. Chidambaram Stadium,1,Mumbai Indians,Delhi Capitals,55


In [56]:
# Renames column "score_off_ball" to "score_6_over"

data_non_ml.rename(columns={"score_off_ball": "score_6_over"}, inplace = True)

In [76]:
# Seperates score column from the features data set

y_train = data_non_ml[["score_6_over"]]
X_train = data_non_ml[["match_id", "venue", "innings", "batting_team", "bowling_team"]]

In [77]:
# Splits the available data into train and test data sets

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, shuffle = False)

In [78]:
# Resets indexes

X_test.reset_index(drop = True, inplace = True)
y_test.reset_index(drop = True, inplace = True)

In [171]:
# Creates an array to store the predicted innings scores

predictions = []

In [172]:
# Loops through each innings in test data, predicts the innings score and saves it into predictions array

for i, test_match_innings in X_test.iterrows():
    past_matches = X_train[(X_train.venue == test_match_innings.venue) & 
                           (X_train.innings == test_match_innings.innings) & 
                           (X_train.batting_team == test_match_innings.batting_team) & 
                           (X_train.bowling_team == test_match_innings.bowling_team)]
    if past_matches.shape[0] > 0:
        predictions.append(y_train.iloc[past_matches.index].mean()[0])
    else:
        past_matches = X_train[(X_train.innings == test_match_innings.innings) & 
                           (X_train.batting_team == test_match_innings.batting_team) & 
                           (X_train.bowling_team == test_match_innings.bowling_team)]
        if past_matches.shape[0] > 0:
            predictions.append(y_train.iloc[past_matches.index].mean()[0])
        else:
            past_matches = X_train[(X_train.innings == test_match_innings.innings) & 
                               (X_train.batting_team == test_match_innings.batting_team)]
            if past_matches.shape[0] > 0:
                predictions.append(y_train.iloc[past_matches.index].mean()[0])
            else:
                predictions.append(y_train.mean()[0])

In [191]:
# Calculates the mean absolute error against predicted scores across all test data

mean_absolute_error(y_test.score_6_over.array, predictions)

10.71942048690337

## Above mentioned Mean Absolute Error which is 10.7 is basically from a common sense and non-machine learning based model and will be treated as baselined performance. Now, to justify developing machine learning models, the score prediction accuracy must be higher than that of this common sense based baselined model. Refer next notebooks that will apply machine learning techniques aiming improvisation to achieve better prediction accuracy.