# Cricket Prediction using machine learning

## Data Preprocessing

In [1]:
import pandas as pd

In [2]:
dataset=pd.read_csv('ipl.csv',index_col=0)

In [3]:
dataset = dataset.drop(columns=['gender', 'match_type','date','umpire_1','umpire_2','player of the match','win_by_runs','win_by_wickets'])

In [4]:
# columns with missing values
dataset.columns[dataset.isnull().any()]

Index(['city'], dtype='object')

In [5]:
dataset['city'].fillna(dataset['city'].mode()[0], inplace=True)

In [6]:
dataset.columns[dataset.isnull().any()]

dataset.replace(['Mumbai Indians','Kolkata Knight Riders','Royal Challengers Bangalore','Deccan Chargers','Chennai Super Kings',
                 'Rajasthan Royals','Delhi Daredevils','Gujarat Lions','Kings XI Punjab',
                 'Sunrisers Hyderabad','Rising Pune Supergiants','Kochi Tuskers Kerala','Pune Warriors','Rising Pune Supergiant']
                ,['MI','KKR','RCB','DC','CSK','RR','DD','GL','KXIP','SRH','RPS','KTK','PW','RPS'],inplace=True)

In [7]:
def createDict(series) :
    
    dictionary={}
    
    i=0
    
    for ser in series :
        if(ser in dictionary) :
            continue
        dictionary[ser]=i
        i=i+1
        
    return dictionary

In [8]:
teamDict=createDict(dataset['team 1'])

toss_winnerDict=createDict(dataset['toss_winner'])

cityDict=createDict(dataset['city'])

venueDict=createDict(dataset['venue'])

winnerDict=dict(teamDict)

winnerDict['tie']=14

winnerDict['no result']=15

In [9]:
encode = {
'team 1': teamDict,
'team 2': teamDict,
'toss_winner': teamDict,
'winner': winnerDict,
'city':cityDict,
'venue':venueDict
    
 }
dataset.replace(encode, inplace=True)

In [10]:
dataset.head(5)

Unnamed: 0,city,team 1,team 2,toss_decision,toss_winner,venue,winner
0,0,0,4,field,0,0,4
1,1,1,7,bat,7,1,7
2,2,2,5,bat,5,2,2
3,3,3,0,bat,3,3,0
4,4,4,6,bat,6,4,4


In [11]:
winner = dataset['winner']

In [12]:
features = dataset.drop('winner',axis=1)

In [13]:
features=pd.get_dummies(features)

In [14]:
from sklearn.preprocessing import OneHotEncoder

def oneHotEncode(col,df) :
    
    ohe = OneHotEncoder(sparse=False)
    
    temp = ohe.fit_transform(features[[col]])
    
    currDict={}
    
    if(col=='team 1' or col=='team 2' or col=='toss_winner'):
        
        currDict=teamDict
    
    if(col=='city') :
        
        currDict=cityDict
        
    if(col=='venue') :
        
        currDict=venueDict
        
    temp=pd.DataFrame(temp,columns={
        value for value in [col+'_'+key for key,val in currDict.items()]
    })
        

    df=df.join(temp)
    
    df=df.drop(columns=col)
    
    return df

In [15]:
features = oneHotEncode('team 1',features)

features = oneHotEncode('city',features)

features = oneHotEncode('team 2',features)

features = oneHotEncode('toss_winner',features)

In [16]:
features

Unnamed: 0,venue,toss_decision_bat,toss_decision_field,team 1_SRH,team 1_RPS,team 1_DC,team 1_CSK,team 1_MI,team 1_KXIP,team 1_KTK,...,toss_winner_PW,toss_winner_KXIP,toss_winner_RPS,toss_winner_DD,toss_winner_GL,toss_winner_CSK,toss_winner_KTK,toss_winner_DC,toss_winner_MI,toss_winner_RCB
0,0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,6,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    winner,random_state=0,test_size =.25)

In [19]:
def prediction(Model,X_train,y_train,X_test,y_test) :
    
    clf=Model()
    
    clf.fit(X_train,y_train)
    
    print(clf.score(X_test,y_test))
    
    return clf


In [20]:
from sklearn.neural_network import MLPClassifier

from sklearn.svm import LinearSVC

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

clf_A = prediction(MLPClassifier,X_train,y_train,X_test,y_test)

clf_B = prediction(LinearSVC,X_train,y_train,X_test,y_test)

clf_C = prediction(LogisticRegression,X_train,y_train,X_test,y_test)

clf_D = prediction(RandomForestClassifier,X_train,y_train,X_test,y_test)



0.5534591194968553
0.5031446540880503
0.5786163522012578
0.4968553459119497


