In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

In [2]:
matches = pd.read_csv('IPL Matches 2008-2020.csv')

In [3]:
matches = matches.iloc[:,0:17]

In [4]:
matches.isna().sum()

id                   0
city                13
date                 0
player_of_match      4
venue                0
neutral_venue        0
team1                0
team2                0
toss_winner          0
toss_decision        0
winner               4
result               4
result_margin       17
eliminator           4
method             797
umpire1              0
umpire2              0
dtype: int64

In [5]:
matches['method'] = matches['method'].apply(lambda x: 1 if x == 'D/L' else 0)

In [6]:
matches.loc[matches['result']=='tie','result_margin'] = 0

In [7]:
matches.loc[matches.city =='Bengaluru','city'] = 'Bangalore'

In [8]:
matches.team1.unique()

array(['Royal Challengers Bangalore', 'Kings XI Punjab',
       'Delhi Daredevils', 'Mumbai Indians', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Deccan Chargers', 'Chennai Super Kings',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Gujarat Lions', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Delhi Capitals'], dtype=object)

In [9]:
matches['team1'].replace('Rising Pune Supergiant', 'Rising Pune Supergiants', inplace = True)
matches['team2'].replace('Rising Pune Supergiant', 'Rising Pune Supergiants', inplace = True)

In [10]:
Team_Name = matches.team1.unique()
y = ['RCB','PBKS','DD','MI','KKR','RR','Deccan Chargers','SRH','CSK','KTK','PW','GL','RPS','DC']

matches.replace(Team_Name,y,inplace=True)

In [11]:
matches.loc[matches.venue =='Sharjah Cricket Stadium','city'] = 'Sharjah'
matches.loc[matches.venue =='Dubai International Cricket Stadium','city'] = 'Dubai'

In [12]:
matches.isna().sum()

id                 0
city               0
date               0
player_of_match    4
venue              0
neutral_venue      0
team1              0
team2              0
toss_winner        0
toss_decision      0
winner             4
result             4
result_margin      4
eliminator         4
method             0
umpire1            0
umpire2            0
dtype: int64

In [13]:
matches = matches.dropna()

In [14]:
matches.shape

(812, 17)

In [15]:
matches.columns

Index(['id', 'city', 'date', 'player_of_match', 'venue', 'neutral_venue',
       'team1', 'team2', 'toss_winner', 'toss_decision', 'winner', 'result',
       'result_margin', 'eliminator', 'method', 'umpire1', 'umpire2'],
      dtype='object')

In [16]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 812 entries, 0 to 815
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               812 non-null    int64  
 1   city             812 non-null    object 
 2   date             812 non-null    object 
 3   player_of_match  812 non-null    object 
 4   venue            812 non-null    object 
 5   neutral_venue    812 non-null    int64  
 6   team1            812 non-null    object 
 7   team2            812 non-null    object 
 8   toss_winner      812 non-null    object 
 9   toss_decision    812 non-null    object 
 10  winner           812 non-null    object 
 11  result           812 non-null    object 
 12  result_margin    812 non-null    float64
 13  eliminator       812 non-null    object 
 14  method           812 non-null    int64  
 15  umpire1          812 non-null    object 
 16  umpire2          812 non-null    object 
dtypes: float64(1), i

In [17]:
from sklearn.preprocessing  import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score

In [18]:
matches.head()

Unnamed: 0,id,city,date,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,method,umpire1,umpire2
0,335982,Bangalore,18-04-2008,BB McCullum,M Chinnaswamy Stadium,0,RCB,KKR,RCB,field,KKR,runs,140.0,N,0,Asad Rauf,RE Koertzen
1,335983,Chandigarh,19-04-2008,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",0,PBKS,SRH,SRH,bat,SRH,runs,33.0,N,0,MR Benson,SL Shastri
2,335984,Delhi,19-04-2008,MF Maharoof,Feroz Shah Kotla,0,DD,RR,RR,bat,DD,wickets,9.0,N,0,Aleem Dar,GA Pratapkumar
3,335985,Mumbai,20-04-2008,MV Boucher,Wankhede Stadium,0,MI,RCB,MI,bat,RCB,wickets,5.0,N,0,SJ Davis,DJ Harper
4,335986,Kolkata,20-04-2008,DJ Hussey,Eden Gardens,0,KKR,Deccan Chargers,Deccan Chargers,bat,KKR,wickets,5.0,N,0,BF Bowden,K Hariharan


In [19]:
matches['result_margin'] = matches['result_margin'].astype(int)

In [20]:
matches['eliminator'] = matches['eliminator'].apply(lambda x: 0 if x == 'N' else 1)

In [21]:
matches['toss_decision'] = matches['toss_decision'].apply(lambda x: 1 if x =='bat' else 0)

In [22]:
matches['toss_win_game_win'] = np.where((matches.toss_winner == matches.winner),'Yes','No')
matches['toss_win_game_win'] = matches['toss_win_game_win'].apply(lambda x: 0 if x=='No' else 1)

In [23]:
matches['team1_homeground'] = 0
matches.loc[(matches['city']=='Mumbai') & (matches['team1']=='MI'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Pune') & (matches['team1']=='PW'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Pune') & (matches['team1']=='RPS'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Chennai') & (matches['team1']=='CSK'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Delhi') & (matches['team1']=='Delhi Capitals'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Jaipur') & (matches['team1']=='RR'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Hyderabad') & (matches['team1']=='SRH'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Bangalore') & (matches['team1']=='RCB'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Chandigarh') & (matches['team1']=='PBKS'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Kolkata') & (matches['team1']=='KKR'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Kochi') & (matches['team1']=='KTK'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Rajkot') & (matches['team1']=='GL'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Delhi') & (matches['team1']=='DD'),'team1_homeground'] = 1
matches.loc[(matches['city']=='Hyderabad') & (matches['team1']=='DC'),'team1_homeground'] = 1

In [24]:
matches.drop(['city','player_of_match'], axis = 1, inplace = True)
matches.drop(['date', 'id'], axis = 1, inplace = True)
matches.drop('venue', axis =1, inplace = True)

In [25]:
matches.drop(['umpire1','umpire2'], axis = 1, inplace = True)

In [26]:
current_teams = ['KKR','MI','DC','CSK','RR','PBKS','SRH','RCB']
data = matches[(matches['team1'].isin(current_teams)) & (matches['team2'].isin(current_teams))]

In [27]:
data.loc[data["winner"]==data["team1"],"team1_win"]=1
data.loc[data["winner"]!=data["team1"],"team1_win"]=0
data['team1_win'] = data['team1_win'].astype(int)

In [28]:
data.shape

(404, 13)

In [29]:
data_vif = data

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score

In [31]:
import statsmodels.api as sm


def calculate_vif(data):
    vif_df = pd.DataFrame(columns=['Var', 'Vif'])
    x_var_names = data.columns
    for i in range(0, x_var_names.shape[0]):
        y = data[x_var_names[i]]
        x = data[x_var_names.drop([x_var_names[i]])]
        r_squared = sm.OLS(y, x).fit().rsquared
        vif = round(1 / (1 - r_squared), 2)
        vif_df.loc[i] = [x_var_names[i], vif]
    return vif_df.sort_values(by='Vif', axis=0, ascending=False, inplace=False)

In [32]:
modeldata = data[['team1','team2','toss_winner','toss_decision','result','result_margin','eliminator','method','toss_win_game_win','team1_homeground','team1_win']]

In [33]:
modeldata.loc[modeldata["team1"]==modeldata["toss_winner"],"team1_toss_win"]=1
modeldata.loc[modeldata["team1"]!=modeldata["toss_winner"],"team1_toss_win"]=0
modeldata['team1_toss_win'] = modeldata['team1_toss_win'].astype(int)

In [34]:
matches.result.unique()

array(['runs', 'wickets', 'tie'], dtype=object)

## MODEL

### Model with variable Eliminator and Method

In [35]:
data1 = modeldata[['team1','team2','toss_decision','result','eliminator','method','team1_homeground','team1_win'
                   ,'team1_toss_win']]
data = pd.get_dummies(
    data=data1, columns=['team1', 'team2', 'result'])
#data

x = data.drop('team1_win', axis = 1)
y = data['team1_win']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [36]:
#Decision Tree
dtree=DecisionTreeClassifier()
dtree.fit(x_train,y_train)
y_pred_dt = dtree.predict(x_test)
#y_score_dt = 
print('Accuracy of Decision Tree Classifier on test set: {:.4f}'.format(dtree.score(x_test, y_test)))

# Confusion Matrix
#print('Confusion Matrix')
cm_dt =confusion_matrix(y_test, y_pred_dt)

# Accuracy
#print('Accuracy')
as_dt = accuracy_score(y_test, y_pred_dt)*100

# Recall
#print('Recall')
rs_dt = recall_score(y_test, y_pred_dt, average=None)

# Precision
#print('Precision')
ps_dt = precision_score(y_test, y_pred_dt, average=None)

#Classification Report
#print('Classification Report')
print(classification_report(y_test, y_pred_dt))

Accuracy of Decision Tree Classifier on test set: 0.9383
              precision    recall  f1-score   support

           0       0.94      0.91      0.92        33
           1       0.94      0.96      0.95        48

    accuracy                           0.94        81
   macro avg       0.94      0.93      0.94        81
weighted avg       0.94      0.94      0.94        81



In [37]:
as_dt

93.82716049382715

In [38]:
#Random Forest Classifier
rf= RandomForestClassifier(n_estimators=100)
rf.fit(x_train,y_train)
y_pred_rf = rf.predict(x_test)
print('Accuracy of Random Forest Classifier on test set: {:.4f}'.format(rf.score(x_test, y_test)))

# Confusion Matrix
#print('Confusion Matrix')
cm_rf = confusion_matrix(y_test, y_pred_rf)
print(cm_rf)
# Accuracy
#print('Accuracy')
as_rf = accuracy_score(y_test, y_pred_rf)*100
print(as_rf)
# Recall
#print('Recall')
rs_rf = recall_score(y_test, y_pred_rf, average=None)

# Precision
#print('Precision')
ps_rf = precision_score(y_test, y_pred_rf, average=None)

#Classification Report
#print('Classification Report')
cr_rf = classification_report(y_test, y_pred_rf)
print(cr_rf)


Accuracy of Random Forest Classifier on test set: 0.8395
[[26  7]
 [ 6 42]]
83.9506172839506
              precision    recall  f1-score   support

           0       0.81      0.79      0.80        33
           1       0.86      0.88      0.87        48

    accuracy                           0.84        81
   macro avg       0.83      0.83      0.83        81
weighted avg       0.84      0.84      0.84        81



### Model without variable Eliminator and Method

In [39]:
data1 = modeldata[['team1','team2','toss_decision','result','team1_homeground','team1_win'
                   ,'team1_toss_win']]
data = pd.get_dummies(data=data1, columns=['team1', 'team2', 'result'])
#data

x = data.drop('team1_win', axis = 1)
y = data['team1_win']
print(x.shape)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train.shape, x_test.shape

(404, 22)


((323, 22), (81, 22))

In [40]:
print(x_train.loc[0])
y_train.loc[0]

toss_decision       0
team1_homeground    1
team1_toss_win      1
team1_CSK           0
team1_DC            0
team1_KKR           0
team1_MI            0
team1_PBKS          0
team1_RCB           1
team1_RR            0
team1_SRH           0
team2_CSK           0
team2_DC            0
team2_KKR           1
team2_MI            0
team2_PBKS          0
team2_RCB           0
team2_RR            0
team2_SRH           0
result_runs         1
result_tie          0
result_wickets      0
Name: 0, dtype: int64


0

In [41]:
#Decision Tree
dtree1=DecisionTreeClassifier()
dtree1.fit(x_train,y_train)
y_pred_dt1 = dtree1.predict(x_test)
#y_score_dt = 
print('Accuracy of Decision Tree Classifier on test set: {:.4f}'.format(dtree1.score(x_test, y_test)))

# Confusion Matrix
#print('Confusion Matrix')
cm_dt1 =confusion_matrix(y_test, y_pred_dt1)

# Accuracy
#print('Accuracy')
as_dt1 = accuracy_score(y_test, y_pred_dt1)*100

# Recall
#print('Recall')
rs_dt1 = recall_score(y_test, y_pred_dt1, average=None)

# Precision
#print('Precision')
ps_dt1 = precision_score(y_test, y_pred_dt1, average=None)

#Classification Report
#print('Classification Report')
cr_dt1 = classification_report(y_test, y_pred_dt1)

Accuracy of Decision Tree Classifier on test set: 0.9259


In [42]:
import pickle
file = open('tree11.pkl', 'wb')

pickle.dump(dtree1, file)

In [43]:
#Random Forest Classifier
rf1= RandomForestClassifier(n_estimators=100)
rf1.fit(x_train,y_train)
y_pred_rf1 = rf1.predict(x_test)
print('Accuracy of Random Forest Classifier on test set: {:.4f}'.format(rf1.score(x_test, y_test)))

# Confusion Matrix
#print('Confusion Matrix')
cm_rf1 = confusion_matrix(y_test, y_pred_rf1)
# Accuracy
#print('Accuracy')
as_rf1 = accuracy_score(y_test, y_pred_rf1)*100
# Recall
#print('Recall')
rs_rf1 = recall_score(y_test, y_pred_rf1, average=None)
# Precision
#print('Precision')
ps_rf1 = precision_score(y_test, y_pred_rf1, average=None)
#Classification Report
#print('Classification Report')
cr_rf1 = classification_report(y_test, y_pred_rf1)



Accuracy of Random Forest Classifier on test set: 0.8025


In [44]:
dtree1.predict([[0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1]])

array([1])

In [45]:
import pickle
file = open('decision_tree1.pkl', 'wb')

pickle.dump(dtree1, file)

In [52]:
x_train.loc[0]

toss_decision       0
team1_homeground    1
team1_toss_win      1
team1_CSK           0
team1_DC            0
team1_KKR           0
team1_MI            0
team1_PBKS          0
team1_RCB           1
team1_RR            0
team1_SRH           0
team2_CSK           0
team2_DC            0
team2_KKR           1
team2_MI            0
team2_PBKS          0
team2_RCB           0
team2_RR            0
team2_SRH           0
result_runs         1
result_tie          0
result_wickets      0
Name: 0, dtype: int64

In [54]:
y_train.loc[0]

0