################################################################################

**Author**: _Pradip Kumar Das_

**License:** https://github.com/PradipKumarDas/Competitions/blob/main/LICENSE

**Profile & Contact:** [LinkedIn](https://www.linkedin.com/in/daspradipkumar/) | [GitHub](https://github.com/PradipKumarDas) | [Kaggle](https://www.kaggle.com/pradipkumardas) | pradipkumardas@hotmail.com (Email)

################################################################################

# IPL 2021 Match Score Prediction Contest Organized by IIT Madras Online B.Sc. Programme Team

## Few Shallow Machine Learning Based Regression Models

In [74]:
# Imports required packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

import pickle

In [2]:
# Sets Pandas option to show all columns

pd.set_option('display.max_columns', None)

In [3]:
# Downloads the dataset from cricsheet.org/downloads (overwrites the file if exists)

!wget https://cricsheet.org/downloads/ipl_csv2.zip -O Data/ipl_csv2.zip

# Unzips the data (overwrites existing files having same name)

!unzip -o -d Data Data/ipl_csv2.zip all_matches.csv README.txt

--2021-04-23 10:14:56--  https://cricsheet.org/downloads/ipl_csv2.zip
Resolving cricsheet.org (cricsheet.org)... 64.90.49.16
Connecting to cricsheet.org (cricsheet.org)|64.90.49.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3298742 (3.1M) [application/zip]
Saving to: ‘Data/ipl_csv2.zip’


2021-04-23 10:15:01 (906 KB/s) - ‘Data/ipl_csv2.zip’ saved [3298742/3298742]

Archive:  Data/ipl_csv2.zip
  inflating: Data/README.txt         
  inflating: Data/all_matches.csv    


In [4]:
# Load data for all matches

data = pd.read_csv("Data/all_matches.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Checks top few rows of the data

data.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,1,1.0,,,,,,,,
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,


In [6]:
# Checks bottom rows of the data

data.tail()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
197512,1254073,2021,2021-04-22,"Wankhede Stadium, Mumbai",2,15.6,Royal Challengers Bangalore,Rajasthan Royals,V Kohli,D Padikkal,C Sakariya,0,0,,,,,,,,,
197513,1254073,2021,2021-04-22,"Wankhede Stadium, Mumbai",2,16.1,Royal Challengers Bangalore,Rajasthan Royals,D Padikkal,V Kohli,Mustafizur Rahman,4,0,,,,,,,,,
197514,1254073,2021,2021-04-22,"Wankhede Stadium, Mumbai",2,16.2,Royal Challengers Bangalore,Rajasthan Royals,D Padikkal,V Kohli,Mustafizur Rahman,0,1,,,,1.0,,,,,
197515,1254073,2021,2021-04-22,"Wankhede Stadium, Mumbai",2,16.3,Royal Challengers Bangalore,Rajasthan Royals,V Kohli,D Padikkal,Mustafizur Rahman,1,0,,,,,,,,,
197516,1254073,2021,2021-04-22,"Wankhede Stadium, Mumbai",2,16.4,Royal Challengers Bangalore,Rajasthan Royals,D Padikkal,V Kohli,Mustafizur Rahman,0,5,5.0,,,,,,,,


In [7]:
# Checks for missing values

data.isna().sum()

match_id                       0
season                         0
start_date                     0
venue                          0
innings                        0
ball                           0
batting_team                   0
bowling_team                   0
striker                        0
non_striker                    0
bowler                         0
runs_off_bat                   0
extras                         0
wides                     191535
noballs                   196717
byes                      196995
legbyes                   194363
penalty                   197515
wicket_type               187784
player_dismissed          187784
other_wicket_type         197517
other_player_dismissed    197517
dtype: int64

In [8]:
# Inserts a new calculated column called "score_off_ball" which is a sum of values in 
# columns "runs_off_bat" and "extras" just after column "extras" to indicate contributing score off the ball
# to make calculating total score at the end of the match easy

data.insert(loc=13, column="score_off_ball", value=data.runs_off_bat + data.extras)

#### Checks for venues for duplicates with slightly different names, if any, and updates the rows with the same venue names accordingly

In [9]:
data.venue.value_counts().sort_index()

Arun Jaitley Stadium                                     3356
Barabati Stadium                                         1695
Brabourne Stadium                                        2469
Brabourne Stadium, Mumbai                                 250
Buffalo Park                                              715
De Beers Diamond Oval                                     726
Dr DY Patil Sports Academy                               3993
Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium      3037
Dubai International Cricket Stadium                      8080
Eden Gardens                                            17988
Feroz Shah Kotla                                        13950
Green Park                                                921
Himachal Pradesh Cricket Association Stadium             2159
Holkar Cricket Stadium                                   1965
JSCA International Stadium Complex                       1671
Kingsmead                                                3643
M Chinna

In [10]:
# Updates these venues that are mentioned in different names with same name

data.venue[data.venue.str.contains("Brabourne", 
                                                 case=False)] = "Brabourne Stadium"
data.venue[data.venue.str.contains("Chinnaswamy", 
                                                 case=False)] = "M. Chinnaswamy Stadium"
data.venue[data.venue.str.contains("Chidambaram", 
                                                 case=False)] = "M. A. Chidambaram Stadium"
data.venue[data.venue.str.contains(r'Punjab Cricket|IS Bindra|Inderjit Singh Bindra', 
                                                 case=False)] = "IS Bindra Stadium"
data.venue[data.venue.str.contains("Rajiv Gandhi", 
                                                 case=False)] = "Rajiv Gandhi International Cricket Stadium"
data.venue[data.venue.str.contains("Wankhede", 
                                                 case=False)] = "Wankhede Stadium"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the c

#### Checks for teams for duplicates with slightly different names, if any, and updates the rows with the same team names accordingly

In [11]:
data.batting_team.append(data.bowling_team).value_counts().sort_index()

Chennai Super Kings            43657
Deccan Chargers                18073
Delhi Capitals                  8998
Delhi Daredevils               37511
Gujarat Lions                   7111
Kings XI Punjab                45129
Kochi Tuskers Kerala            3196
Kolkata Knight Riders          46175
Mumbai Indians                 49956
Pune Warriors                  10900
Punjab Kings                     946
Rajasthan Royals               39071
Rising Pune Supergiant          3828
Rising Pune Supergiants         3195
Royal Challengers Bangalore    46750
Sunrisers Hyderabad            30538
dtype: int64

In [12]:
# Updates team name from "Delhi Daredevils" with the new name "Delhi Capitals"

data.batting_team[data.batting_team.str.contains("Delhi Daredevils", case=False)] = "Delhi Capitals"
data.bowling_team[data.bowling_team.str.contains("Delhi Daredevils", case=False)] = "Delhi Capitals"

# Updates team name from "Kings XI Punjab" with the new name "Punjab Kings"

data.batting_team[data.batting_team.str.contains("Kings XI Punjab", case=False)] = "Punjab Kings"
data.bowling_team[data.bowling_team.str.contains("Kings XI Punjab", case=False)] = "Punjab Kings"

# Updates appropriate team name for "Rising Pune Supergiant"

data.batting_team[data.batting_team.str.contains("Rising Pune Supergiants", case=False)] = "Rising Pune Supergiant"
data.bowling_team[data.bowling_team.str.contains("Rising Pune Supergiants", case=False)] = "Rising Pune Supergiant"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to b

## Let's first build a simple linear regression machine learning model as baselined machine learning model as we did for common sense based non-machine learning model. 

In [13]:
# First, lets have small dataset for that

data_simple = data[data.ball <= 6.0][["match_id",
                                      "venue", 
                                       "innings",
                                       "batting_team", 
                                       "bowling_team", 
                                       "score_off_ball"]]

In [14]:
# Checks shape of the filtered data

data_simple.shape

(62273, 6)

In [15]:
# Resets its index

data_simple.reset_index(drop = True, inplace = True)

In [16]:
# Calculates the match wise total score after end of 6 overs

data_simple = data_simple.groupby(
    ["match_id", "venue", "innings", "batting_team", "bowling_team"]).score_off_ball.sum()

In [17]:
# Checks how the scores look

data_simple

match_id  venue                      innings  batting_team                 bowling_team               
335982    M. Chinnaswamy Stadium     1        Kolkata Knight Riders        Royal Challengers Bangalore    61
                                     2        Royal Challengers Bangalore  Kolkata Knight Riders          26
335983    IS Bindra Stadium          1        Chennai Super Kings          Punjab Kings                   53
                                     2        Punjab Kings                 Chennai Super Kings            63
335984    Feroz Shah Kotla           1        Rajasthan Royals             Delhi Capitals                 40
                                                                                                          ..
1254071   M. A. Chidambaram Stadium  2        Sunrisers Hyderabad          Punjab Kings                   50
1254072   Wankhede Stadium           1        Chennai Super Kings          Kolkata Knight Riders          54
                         

In [18]:
# Resets the multi-indexes of the series to get tabular data

data_simple = data_simple.reset_index()

In [19]:
# Checks once again how to data looks

data_simple

Unnamed: 0,match_id,venue,innings,batting_team,bowling_team,score_off_ball
0,335982,M. Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,61
1,335982,M. Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,26
2,335983,IS Bindra Stadium,1,Chennai Super Kings,Punjab Kings,53
3,335983,IS Bindra Stadium,2,Punjab Kings,Chennai Super Kings,63
4,335984,Feroz Shah Kotla,1,Rajasthan Royals,Delhi Capitals,40
...,...,...,...,...,...,...
1685,1254071,M. A. Chidambaram Stadium,2,Sunrisers Hyderabad,Punjab Kings,50
1686,1254072,Wankhede Stadium,1,Chennai Super Kings,Kolkata Knight Riders,54
1687,1254072,Wankhede Stadium,2,Kolkata Knight Riders,Chennai Super Kings,45
1688,1254073,Wankhede Stadium,1,Rajasthan Royals,Royal Challengers Bangalore,32


In [20]:
# Renames column "score_off_ball" to "score_6_overs"

data_simple.rename(columns={"score_off_ball": "score_6_overs"}, inplace = True)

In [21]:
data_simple

Unnamed: 0,match_id,venue,innings,batting_team,bowling_team,score_6_overs
0,335982,M. Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,61
1,335982,M. Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,26
2,335983,IS Bindra Stadium,1,Chennai Super Kings,Punjab Kings,53
3,335983,IS Bindra Stadium,2,Punjab Kings,Chennai Super Kings,63
4,335984,Feroz Shah Kotla,1,Rajasthan Royals,Delhi Capitals,40
...,...,...,...,...,...,...
1685,1254071,M. A. Chidambaram Stadium,2,Sunrisers Hyderabad,Punjab Kings,50
1686,1254072,Wankhede Stadium,1,Chennai Super Kings,Kolkata Knight Riders,54
1687,1254072,Wankhede Stadium,2,Kolkata Knight Riders,Chennai Super Kings,45
1688,1254073,Wankhede Stadium,1,Rajasthan Royals,Royal Challengers Bangalore,32


In [22]:
# Encodes venues with one-hot encoding technique

venue_count = len(data_simple.venue.unique())
venue_encoder = OneHotEncoder(handle_unknown='ignore')
venue_encoded = pd.DataFrame(venue_encoder.fit_transform(data_simple[["venue"]]).toarray(),
                            columns=[("venue_" + str(i)) for i in range(venue_count)])

# Saves the encoder into persistent store for later use

with open("Models/Venue_Encoder.pickle", "wb") as f:
    pickle.dump(venue_encoder, f, pickle.HIGHEST_PROTOCOL)

# Joins the encoded venue columns with the dataset

data_simple = data_simple.join(venue_encoded).drop(["venue"], axis = 1)

In [23]:
# Encodes innings with one-hot encoding technique

innings_count = len(data_simple.innings.unique())
innings_encoder = OneHotEncoder(handle_unknown='ignore')
innings_encoded = pd.DataFrame(innings_encoder.fit_transform(data_simple[["innings"]]).toarray(),
                            columns=[("innings_" + str(i)) for i in range(innings_count)])

# Saves the encoder into persistent store for later use

with open("Models/Innings_Encoder.pickle", "wb") as f:
    pickle.dump(innings_encoder, f, pickle.HIGHEST_PROTOCOL)

# Joins the encoded innings columns with the dataset

data_simple = data_simple.join(innings_encoded).drop(["innings"], axis = 1)

In [24]:
# Now, encodes teams with one-hot encoding technique

team_count = len(data_simple.batting_team.append(data_simple.bowling_team).unique())
team_encoder = OneHotEncoder(handle_unknown='ignore')
team_encoder.fit(pd.DataFrame(data_simple.batting_team.append(data_simple.bowling_team)))

batting_team_encoded = pd.DataFrame(team_encoder.transform(data_simple[["batting_team"]]).toarray(),
                                    columns=[("batting_team_" + str(i)) for i in range(team_count)])
bowling_team_encoded = pd.DataFrame(team_encoder.transform(data_simple[["bowling_team"]]).toarray(),
                                    columns=[("bowling_team_" + str(i)) for i in range(team_count)])

# Saves the encoder into persistent store for later use

with open("Models/Team_Encoder.pickle", "wb") as f:
    pickle.dump(team_encoder, f, pickle.HIGHEST_PROTOCOL)
    
# Joins the encoded team columns with the dataset

data_simple = data_simple.join(batting_team_encoded).drop(["batting_team"], axis = 1)
data_simple = data_simple.join(bowling_team_encoded).drop(["bowling_team"], axis = 1)

### Now, build a simple linear regression based machine learning model. 

In [25]:
# Removes the column "match_id" as it is not required for machine learning model

data_simple.drop(["match_id"], axis=1, inplace=True)

In [26]:
# Checks how the dataset looks before converting into array to feed into machine learning model

data_simple

Unnamed: 0,score_6_overs,venue_0,venue_1,venue_2,venue_3,venue_4,venue_5,venue_6,venue_7,venue_8,venue_9,venue_10,venue_11,venue_12,venue_13,venue_14,venue_15,venue_16,venue_17,venue_18,venue_19,venue_20,venue_21,venue_22,venue_23,venue_24,venue_25,venue_26,venue_27,venue_28,venue_29,venue_30,venue_31,venue_32,venue_33,venue_34,innings_0,innings_1,innings_2,innings_3,innings_4,innings_5,batting_team_0,batting_team_1,batting_team_2,batting_team_3,batting_team_4,batting_team_5,batting_team_6,batting_team_7,batting_team_8,batting_team_9,batting_team_10,batting_team_11,batting_team_12,bowling_team_0,bowling_team_1,bowling_team_2,bowling_team_3,bowling_team_4,bowling_team_5,bowling_team_6,bowling_team_7,bowling_team_8,bowling_team_9,bowling_team_10,bowling_team_11,bowling_team_12
0,61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1685,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1686,54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1687,45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1688,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [27]:
# Converts DataFrame into 2D tensor

data_simple_array = data_simple.to_numpy()

In [28]:
# Seperates training labels

X_train, y_train = data_simple_array[:,1:], data_simple_array[:,0]

In [29]:
# Splits the available data into train and test data sets

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, shuffle = True)

In [36]:
# Create linear regressor
linear_regressor = LinearRegression(fit_intercept=True, normalize=False)

In [37]:
# Fits the model with training data

linear_regressor.fit(X_train, y_train)

LinearRegression()

In [47]:
# Performs predictions on the test data

predictions_linear_regressor = linear_regressor.predict(X_test)

In [48]:
# Calculates mean absolute error for all predictions

mean_absolute_error(y_test, predictions_linear_regressor)

9.030880177514794

In [40]:
# Saves the model into persistent store for later use

with open("Models/Linear_Regressor.pickle", "wb") as f:
    pickle.dump(linear_regressor, f, pickle.HIGHEST_PROTOCOL)

In [41]:
# Code for reading from persistent model

# with open("Models/Linear_Regressor.pickle", "rb") as f:
#     linear_regressor = pickle.load(f)

## With simple Linear Regression approach Mean Absolute Error (MAE) is around 9 which is better than that of what was achieved i.e. 10.7 using by Common Sense based model and hence it justified the effort and time to build machine learning models. This better performance will be treated as machine learning based baselined performance.

## Now, let's experiment with Decision Tree model to check if these can beat this machine learning based baselined performance.

In [42]:
# Creates decision tree regressor

decisionTree_regressor = DecisionTreeRegressor()

In [43]:
# Fits the model with training data

decisionTree_regressor.fit(X_train, y_train)

DecisionTreeRegressor()

In [49]:
# Performs predictions on the test data

predictions_decisionTree_regressor = decisionTree_regressor.predict(X_test)

In [50]:
# Calculates mean absolute error for all predictions

mean_absolute_error(y_test, predictions_decisionTree_regressor)

11.120364891518738

In [96]:
# Saves the model into persistent store for later use

with open("Models/Decisition_Tree_Regressor.pickle", "wb") as f:
    pickle.dump(decisionTree_regressor, f, pickle.HIGHEST_PROTOCOL)

## Decision tree based model scored around 11 as Mean Absolute Error (MAE) on test data. This is to note that this MAE is higher from both Common Sense based model and Linear Regressor model.

## Let's now try Random Forrest model.

In [59]:
# Creates Random Forest regressor
randomForest_regressor = RandomForestRegressor()

In [61]:
# Fits the model with training data

randomForest_regressor.fit(X_train, y_train)

RandomForestRegressor()

In [62]:
# Performs predictions on the test data

predictions_randomForest_regressor = randomForest_regressor.predict(X_test)

In [63]:
# Calculates mean absolute error for all predictions

mean_absolute_error(y_test, predictions_randomForest_regressor)

9.707392531449484

In [97]:
# Saves the model into persistent store for later use

with open("Models/Random_Forest_Regressor.pickle", "wb") as f:
    pickle.dump(randomForest_regressor, f, pickle.HIGHEST_PROTOCOL)

## The MAE of the Random Forest model is slightly higher than that of Linear Regression model, but less than that of both Common Sense and Decision Tree model.

## Let's now try Gradient Boosted Regressor with XGBoost

In [92]:
# Creates XGBoost regressor

xgboost_regressor = XGBRegressor()

In [93]:
# Fits the model with training data

xgboost_regressor.fit(X_train, y_train)



XGBRegressor()

In [94]:
# Performs predictions on the test data

predictions_xgboost_regressor = xgboost_regressor.predict(X_test)

In [95]:
# Calculates mean absolute error for all predictions

mean_absolute_error(y_test, predictions_xgboost_regressor)

8.951063758522801

In [98]:
# Saves the model into persistent store for later use

with open("Models/Gradient_Boosted_Regressor.pickle", "wb") as f:
    pickle.dump(xgboost_regressor, f, pickle.HIGHEST_PROTOCOL)

## Above output shows the Mean Absolute Error (MAE) is 8.95 and this performance on the test data is better than the performances of all the models e.g. Common Sense, Decision Tree and Random Forest that we have used so far.

## Hence, 8.95 is now being considered as new machine learning baselined performance.

## Next, refer the next notebook where we shall try out Deep Learning techniques to find if it can overperform present baselined performance.