## Model Development

In [22]:
import os,sys
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
data = pd.read_csv('data/final_score_data.csv')

In [3]:
data.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'total_runs', 'player_dismissed', 'Cumsum_Total', 'wickets_lost',
       'total_wickets', 'Last_5overs_runs', 'Last_5overs_wickets',
       'Total_Score', 'id', 'season', 'city', 'venue'],
      dtype='object')

In [4]:
data.head(5)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,total_runs,player_dismissed,Cumsum_Total,wickets_lost,total_wickets,Last_5overs_runs,Last_5overs_wickets,Total_Score,id,season,city,venue
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,0,,0,0,0,0.0,0.0,207,1,2017,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,0,,0,0,0,0.0,0.0,207,1,2017,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,4,,4,0,0,4.0,0.0,207,1,2017,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
3,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,4,0,,4,0,0,4.0,0.0,207,1,2017,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"
4,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,5,2,,6,0,0,6.0,0.0,207,1,2017,Hyderabad,"Rajiv Gandhi International Stadium, Uppal"


In [5]:
data['batting_team'].unique()

array(['Sunrisers Hyderabad', 'Royal Challengers Bangalore',
       'Mumbai Indians', 'Rising Pune Supergiant', 'Gujarat Lions',
       'Kolkata Knight Riders', 'Kings XI Punjab', 'Delhi Daredevils',
       'Chennai Super Kings', 'Rajasthan Royals', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiants',
       'Delhi Capitals'], dtype=object)

In [6]:
data.groupby(by='batting_team')['season'].unique()

batting_team
Chennai Super Kings            [2008, 2009, 2010, 2011, 2012, 2013, 2014, 201...
Deccan Chargers                                   [2008, 2009, 2010, 2011, 2012]
Delhi Capitals                                                            [2019]
Delhi Daredevils               [2017, 2008, 2009, 2010, 2011, 2012, 2013, 201...
Gujarat Lions                                                       [2017, 2016]
Kings XI Punjab                [2017, 2008, 2009, 2010, 2011, 2012, 2013, 201...
Kochi Tuskers Kerala                                                      [2011]
Kolkata Knight Riders          [2017, 2008, 2009, 2010, 2011, 2012, 2013, 201...
Mumbai Indians                 [2017, 2008, 2009, 2010, 2011, 2012, 2013, 201...
Pune Warriors                                                 [2011, 2012, 2013]
Rajasthan Royals               [2008, 2009, 2010, 2011, 2012, 2013, 2014, 201...
Rising Pune Supergiant                                                    [2017]
Rising Pune Sup

In [7]:
## dropping the below Teams
drop_teams = ['Pune Warriors','Kochi Tuskers Kerala']
drop_teams_rowId = data[data['batting_team'].isin(drop_teams) | data['bowling_team'].isin(drop_teams)].index

data.drop(labels=drop_teams_rowId,axis=0,inplace=True)

In [8]:
## Replacing some teams with other teams and also replacing with Shortnames
rename_map = {'Rising Pune Supergiant':'CSK','Chennai Super Kings':'CSK','Rising Pune Supergiants':'CSK',
              'Deccan Chargers':'SRH', 'Sunrisers Hyderabad':'SRH',
              'Delhi Daredevils' : 'DC','Delhi Capitals':'DC',
              'Gujarat Lions' : 'RR', 'Rajasthan Royals':'RR',
              'Kings XI Punjab' : 'KXIP',
              'Royal Challengers Bangalore' : 'RCB',
              'Mumbai Indians' : 'MI',
              'Kolkata Knight Riders' : 'KKR'}
              
data['batting_team'] = data['batting_team'].map(rename_map).fillna(data['batting_team'])
data['bowling_team'] = data['bowling_team'].map(rename_map).fillna(data['bowling_team'])

In [9]:
## Location and venues
print(data['venue'].nunique())
data['venue'].unique()

40


array(['Rajiv Gandhi International Stadium, Uppal',
       'Maharashtra Cricket Association Stadium',
       'Saurashtra Cricket Association Stadium', 'Holkar Cricket Stadium',
       'M Chinnaswamy Stadium', 'Wankhede Stadium', 'Eden Gardens',
       'Feroz Shah Kotla',
       'Punjab Cricket Association IS Bindra Stadium, Mohali',
       'Green Park', 'Punjab Cricket Association Stadium, Mohali',
       'Sawai Mansingh Stadium', 'MA Chidambaram Stadium, Chepauk',
       'Dr DY Patil Sports Academy', 'Newlands', "St George's Park",
       'Kingsmead', 'SuperSport Park', 'Buffalo Park',
       'New Wanderers Stadium', 'De Beers Diamond Oval',
       'OUTsurance Oval', 'Brabourne Stadium',
       'Sardar Patel Stadium, Motera', 'Barabati Stadium',
       'Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Subrata Roy Sahara Stadium',
       'Shaheed Veer Narayan Singh

In [10]:
## Mapping the Stadium names(venue), same stadium name is written in different ways
rename_stadium = {'Rajiv Gandhi International Stadium, Uppal' : 'RGIS',
                    'Maharashtra Cricket Association Stadium':'MCAS',
                    'Saurashtra Cricket Association Stadium' : 'SCAS' , 
                    'Holkar Cricket Stadium': 'HCS',
                    'M Chinnaswamy Stadium' : 'MCS',
                    'Wankhede Stadium' : 'WS', 
                    'Eden Gardens':'Eden',
                    'Feroz Shah Kotla' : 'FSK',
                    'Punjab Cricket Association IS Bindra Stadium, Mohali' : 'PCAS',
                    'Green Park' : 'GreenPark', 
                    'Punjab Cricket Association Stadium, Mohali' : 'PCAS',
                    'Sawai Mansingh Stadium' : 'SMS', 
                    'MA Chidambaram Stadium, Chepauk' : 'MACS',
                    'Dr DY Patil Sports Academy' : 'DYPSA', 
                    'Newlands' : 'Newlands', 
                    "St George's Park" : 'GeorgePark',
                    'Kingsmead' : 'Kingsmead', 
                    'SuperSport Park' : 'SuperSport', 
                    'Buffalo Park' : 'BuffaloPark',
                    'New Wanderers Stadium' : 'NewWanderers', 
                    'De Beers Diamond Oval' : 'DeBeersDiamond',
                    'OUTsurance Oval' : 'OUTsurance', 
                    'Brabourne Stadium' : 'Brabourne',
                    'Sardar Patel Stadium, Motera' : 'SPS',
                    'Barabati Stadium' : 'Barabati',
                    'Vidarbha Cricket Association Stadium, Jamtha' : 'VCAS',
                    'Himachal Pradesh Cricket Association Stadium' : 'HPCAS',
                    'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium' : 'YSRACACS',
                    'Subrata Roy Sahara Stadium' : 'SRSS',
                    'Shaheed Veer Narayan Singh International Stadium' : 'SVNSIS',
                    'JSCA International Stadium Complex' : 'JSCAISC', 
                    'Sheikh Zayed Stadium' : 'SZS',
                    'Sharjah Cricket Stadium' : 'Sharjah', 
                    'Dubai International Cricket Stadium' : 'DICS',
                    'M. A. Chidambaram Stadium' : 'MACS', 
                    'Feroz Shah Kotla Ground' : 'FSKG',
                    'M. Chinnaswamy Stadium' : 'MCS', 
                    'Rajiv Gandhi Intl. Cricket Stadium' : 'RGIS',
                    'IS Bindra Stadium' : 'PCAS', 
                    'ACA-VDCA Stadium' : 'YSRACACS' }

data['venue'] = data['venue'].map(rename_stadium).fillna(data['venue'])
print(data['venue'].nunique())

34


In [11]:
data.sort_values(by=['season','match_id','inning'],inplace=True)
data.reset_index(drop='index',inplace=True)

In [12]:
cols_remove = ['match_id','inning','total_runs', 'wickets_lost','player_dismissed','id','city']
data.drop(labels = cols_remove,axis=1,inplace=True)

In [13]:
data.head(4)

Unnamed: 0,batting_team,bowling_team,over,ball,Cumsum_Total,total_wickets,Last_5overs_runs,Last_5overs_wickets,Total_Score,season,venue
0,KKR,RCB,1,1,1,0,1.0,0.0,222,2008,MCS
1,KKR,RCB,1,2,1,0,1.0,0.0,222,2008,MCS
2,KKR,RCB,1,3,2,0,2.0,0.0,222,2008,MCS
3,KKR,RCB,1,4,2,0,2.0,0.0,222,2008,MCS


In [14]:
## renaming cumsum_total to current_score
data.rename(columns={'Cumsum_Total':'Current_Score'},inplace=True)


In [15]:
## decreasing 1 value from overs
data['over'] = data['over'] -1
data = data[data['over']>5]

In [16]:
### creating dummies
final_data = pd.get_dummies(data,columns=['batting_team','bowling_team','venue'])

In [18]:
final_data.head(3)

Unnamed: 0,over,ball,Current_Score,total_wickets,Last_5overs_runs,Last_5overs_wickets,Total_Score,season,batting_team_CSK,batting_team_DC,...,venue_SMS,venue_SPS,venue_SRSS,venue_SVNSIS,venue_SZS,venue_Sharjah,venue_SuperSport,venue_VCAS,venue_WS,venue_YSRACACS
38,6,1,62,1,55.0,1.0,222,2008,0,0,...,0,0,0,0,0,0,0,0,0,0
39,6,2,63,1,52.0,1.0,222,2008,0,0,...,0,0,0,0,0,0,0,0,0,0
40,6,3,64,1,47.0,1.0,222,2008,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
## Splitting the data
X_train = final_data.drop(labels='Total_Score',axis=1)[final_data['season']<=2016]
X_test = final_data.drop(labels='Total_Score',axis=1)[final_data['season'] > 2016]

Y_train = final_data[final_data['season']<=2016]['Total_Score'].values
Y_test = final_data[final_data['season'] > 2016]['Total_Score'].values

## Removing the season column
# Removing the 'date' column
X_train.drop(labels='season', axis=True, inplace=True)
X_test.drop(labels='season', axis=True, inplace=True)

In [23]:
## Metrics
def metrics_score(actual_values,predict_values):
    print("Mean absloute error : ", mean_absolute_error(actual_values,predict_values))
    print("Mean Squared error : ", mean_squared_error(actual_values,predict_values))
    print("RMSE : ", np.sqrt(mean_squared_error(actual_values,predict_values)))
    print("R2 Score : ", r2_score(actual_values,predict_values))


## Linear Regression Model

In [20]:
regressor = LinearRegression()
regressor.fit(X_train,Y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
y_train_pred = regressor.predict(X_train)
print("Training metrics")
metrics_score(Y_train,y_train_pred)

Training metrics
Mean absloute error :  14.14904127108957
Mean Squared error :  363.1296927755925
RMSE :  19.055962131983588
R2 Score :  0.5507523019375072


In [25]:
y_test_pred = regressor.predict(X_test)
print("Testing metrics")
metrics_score(Y_test,y_test_pred)

Testing metrics
Mean absloute error :  15.794007754188387
Mean Squared error :  455.751814581391
RMSE :  21.34834453959817
R2 Score :  0.47688270174574965


## Lasso

In [27]:
lasso = Lasso()
parameters = {'alpha':[1e-10,1e-8,1e-3,1e-2,1,5,10,20,30]}
lassoReg = GridSearchCV(lasso,param_grid=parameters,cv=5,n_jobs=-1,scoring='neg_mean_squared_error')

lassoReg.fit(X_train,Y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [1e-10, 1e-08, 0.001, 0.01, 1, 5, 10, 20,
                                   30]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [28]:
## best parameters
print(lassoReg.best_estimator_)
print(lassoReg.best_params_)
print(lassoReg.best_score_)

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)
{'alpha': 1}
-385.0509158591908


In [29]:
y_train_pred = lassoReg.predict(X_train)
print("Training metrics")
metrics_score(Y_train,y_train_pred)

y_test_pred = lassoReg.predict(X_test)
print("Testing metrics")
metrics_score(Y_test,y_test_pred)

Training metrics
Mean absloute error :  14.422611513661609
Mean Squared error :  382.13404876161894
RMSE :  19.548249250549752
R2 Score :  0.5272409687974804
Testing metrics
Mean absloute error :  15.7186169393189
Mean Squared error :  455.0458791978137
RMSE :  21.331804405577454
R2 Score :  0.47769298268986005


## Random Forest

In [34]:
rf = RandomForestRegressor( random_state=1405)

parameters = {'n_estimators':[51,101,151],
              'max_depth' : [25,30,35,40],
              'max_features' : ['auto','sqrt'],
               'bootstrap' : [True],
               'oob_score' :[True]
              }

rfcModel = GridSearchCV(estimator = rf, param_grid=parameters,n_jobs=-1,cv=10)
rfcModel.fit(X_train,Y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=1405,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_j

In [41]:
###
y_train_pred = rfcModel.predict(X_train)
print("Training metrics")
metrics_score(Y_train,y_train_pred)

y_test_pred = rfcModel.predict(X_test)
print("Testing metrics")
metrics_score(Y_test,y_test_pred)

Training metrics
Mean absloute error :  2.4889630445941893
Mean Squared error :  16.108873725261603
RMSE :  4.013586142748353
R2 Score :  0.9800708270807108
Testing metrics
Mean absloute error :  16.46637334454734
Mean Squared error :  510.18298591731497
RMSE :  22.58723059423875
R2 Score :  0.4144059624787517


### Model got trained more(overfit), need to tune it

In [42]:
## Saving the existing model
joblib.dump(rfcModel, 'predict_score.sav')

['predict_score.sav']